1/*
2 * Copyright © 2010-2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Chris Wilson <chris@chris-wilson.co.uk>
25 *
26 */
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "sna.h"
33#include "sna_render.h"
34#include "sna_render_inline.h"
35#include "sna_reg.h"
36#include "sna_video.h"
37
38#include "gen3_render.h"
39
40#define NO_COMPOSITE 0
41#define NO_COMPOSITE_SPANS 0
42#define NO_COPY 0
43#define NO_COPY_BOXES 0
44#define NO_FILL 0
45#define NO_FILL_ONE 0
46#define NO_FILL_BOXES 0
47
48#define PREFER_BLT_FILL 1
49
/* Classification of a composite channel (source or mask) that selects
 * which gen3 pixel-shader fragment is generated for it. */
enum {
	SHADER_NONE = 0,
	SHADER_ZERO,
	SHADER_BLACK,
	SHADER_WHITE,
	SHADER_CONSTANT,
	SHADER_LINEAR,
	SHADER_RADIAL,
	SHADER_TEXTURE,
	SHADER_OPACITY,
};
61
62#define MAX_3D_SIZE 2048
63#define MAX_3D_PITCH 8192
64
65#define OUT_BATCH(v) batch_emit(sna, v)
66#define OUT_BATCH_F(v) batch_emit_float(sna, v)
67#define OUT_VERTEX(v) vertex_emit(sna, v)
68
/* Variants of the radial-gradient shader. */
enum gen3_radial_mode {
	RADIAL_ONE,
	RADIAL_TWO
};
73
/* Blend-factor table indexed by Render operator (PictOpClear..PictOpAdd).
 *
 * dst_alpha: the op reads destination alpha, so gen3_get_blend_cntl()
 *            must patch the factors when the target format has no alpha
 *            channel (or is the special 8-bit case).
 * src_alpha: the op reads source alpha, so the factors must be patched
 *            for component-alpha (per-channel) blending. */
static const struct blendinfo {
	bool dst_alpha;
	bool src_alpha;
	uint32_t src_blend;
	uint32_t dst_blend;
} gen3_blend_op[] = {
	/* Clear */	{0, 0, BLENDFACT_ZERO, BLENDFACT_ZERO},
	/* Src */	{0, 0, BLENDFACT_ONE, BLENDFACT_ZERO},
	/* Dst */	{0, 0, BLENDFACT_ZERO, BLENDFACT_ONE},
	/* Over */	{0, 1, BLENDFACT_ONE, BLENDFACT_INV_SRC_ALPHA},
	/* OverReverse */ {1, 0, BLENDFACT_INV_DST_ALPHA, BLENDFACT_ONE},
	/* In */	{1, 0, BLENDFACT_DST_ALPHA, BLENDFACT_ZERO},
	/* InReverse */ {0, 1, BLENDFACT_ZERO, BLENDFACT_SRC_ALPHA},
	/* Out */	{1, 0, BLENDFACT_INV_DST_ALPHA, BLENDFACT_ZERO},
	/* OutReverse */ {0, 1, BLENDFACT_ZERO, BLENDFACT_INV_SRC_ALPHA},
	/* Atop */	{1, 1, BLENDFACT_DST_ALPHA, BLENDFACT_INV_SRC_ALPHA},
	/* AtopReverse */ {1, 1, BLENDFACT_INV_DST_ALPHA, BLENDFACT_SRC_ALPHA},
	/* Xor */	{1, 1, BLENDFACT_INV_DST_ALPHA, BLENDFACT_INV_SRC_ALPHA},
	/* Add */	{0, 0, BLENDFACT_ONE, BLENDFACT_ONE},
};
94
95#define S6_COLOR_WRITE_ONLY \
96	(S6_COLOR_WRITE_ENABLE | \
97	 BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT | \
98	 BLENDFACT_ONE << S6_CBUF_SRC_BLEND_FACT_SHIFT | \
99	 BLENDFACT_ZERO << S6_CBUF_DST_BLEND_FACT_SHIFT)
100
/* Render picture formats that the gen3 sampler can read.
 *
 * fmt:      the canonical PICT_* format.
 * xfmt:     an alpha-ignored alias sharing the same hardware layout
 *           (0 when there is none).
 * card_fmt: the MAPSURF/MT sampler encoding.
 * rb_reversed: red/blue are swapped relative to the hardware layout. */
static const struct formatinfo {
	unsigned int fmt, xfmt;
	uint32_t card_fmt;
	bool rb_reversed;
} gen3_tex_formats[] = {
	{PICT_a8, 0, MAPSURF_8BIT | MT_8BIT_A8, false},
	{PICT_a8r8g8b8, 0, MAPSURF_32BIT | MT_32BIT_ARGB8888, false},
	{PICT_x8r8g8b8, 0, MAPSURF_32BIT | MT_32BIT_XRGB8888, false},
	{PICT_a8b8g8r8, 0, MAPSURF_32BIT | MT_32BIT_ABGR8888, false},
	{PICT_x8b8g8r8, 0, MAPSURF_32BIT | MT_32BIT_XBGR8888, false},
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	{PICT_a2r10g10b10, PICT_x2r10g10b10, MAPSURF_32BIT | MT_32BIT_ARGB2101010, false},
	{PICT_a2b10g10r10, PICT_x2b10g10r10, MAPSURF_32BIT | MT_32BIT_ABGR2101010, false},
#endif
	{PICT_r5g6b5, 0, MAPSURF_16BIT | MT_16BIT_RGB565, false},
	{PICT_b5g6r5, 0, MAPSURF_16BIT | MT_16BIT_RGB565, true},
	{PICT_a1r5g5b5, PICT_x1r5g5b5, MAPSURF_16BIT | MT_16BIT_ARGB1555, false},
	{PICT_a1b5g5r5, PICT_x1b5g5r5, MAPSURF_16BIT | MT_16BIT_ARGB1555, true},
	{PICT_a4r4g4b4, PICT_x4r4g4b4, MAPSURF_16BIT | MT_16BIT_ARGB4444, false},
	{PICT_a4b4g4r4, PICT_x4b4g4r4, MAPSURF_16BIT | MT_16BIT_ARGB4444, true},
};
122
123#define xFixedToDouble(f) pixman_fixed_to_double(f)
124
125static inline bool too_large(int width, int height)
126{
127	return width > MAX_3D_SIZE || height > MAX_3D_SIZE;
128}
129
/* Convert an I915_TILING_* mode into the BUF_3D tiling bits for buffer
 * state.  The case fallthrough is deliberate: Y-tiling adds the walk-Y
 * bit and then also sets the tiled-surface bit shared with X-tiling. */
static inline uint32_t gen3_buf_tiling(uint32_t tiling)
{
	uint32_t v = 0;
	switch (tiling) {
	case I915_TILING_Y: v |= BUF_3D_TILE_WALK_Y; /* fall through */
	case I915_TILING_X: v |= BUF_3D_TILED_SURFACE; /* fall through */
	case I915_TILING_NONE: break;
	}
	return v;
}
140
141static inline bool
142gen3_check_pitch_3d(struct kgem_bo *bo)
143{
144	return bo->pitch <= MAX_3D_PITCH;
145}
146
/* Build the S6 blend-control dword for a Render compositing operator.
 *
 * op: Render operator, used to index gen3_blend_op.
 * has_component_alpha: the mask supplies per-channel alpha, so factors
 * reading source alpha must read source colour instead.
 * dst_format: destination picture format, used to patch factors when
 * the target lacks an alpha channel or is the special 8-bit case.
 */
static uint32_t gen3_get_blend_cntl(int op,
				    bool has_component_alpha,
				    uint32_t dst_format)
{
	uint32_t sblend = gen3_blend_op[op].src_blend;
	uint32_t dblend = gen3_blend_op[op].dst_blend;

	if (op <= PictOpSrc) /* for clear and src disable blending */
		return S6_COLOR_WRITE_ONLY;

	/* If there's no dst alpha channel, adjust the blend op so that we'll
	 * treat it as always 1.
	 */
	if (gen3_blend_op[op].dst_alpha) {
		if (PICT_FORMAT_A(dst_format) == 0) {
			if (sblend == BLENDFACT_DST_ALPHA)
				sblend = BLENDFACT_ONE;
			else if (sblend == BLENDFACT_INV_DST_ALPHA)
				sblend = BLENDFACT_ZERO;
		}

		/* gen3 engine reads 8bit color buffer into green channel
		 * in cases like color buffer blending etc., and also writes
		 * back green channel.  So with dst_alpha blend we should use
		 * color factor. See spec on "8-bit rendering".
		 */
		if (dst_format == PICT_a8) {
			if (sblend == BLENDFACT_DST_ALPHA)
				sblend = BLENDFACT_DST_COLR;
			else if (sblend == BLENDFACT_INV_DST_ALPHA)
				sblend = BLENDFACT_INV_DST_COLR;
		}
	}

	/* If the source alpha is being used, then we should only be in a case
	 * where the source blend factor is 0, and the source blend value is the
	 * mask channels multiplied by the source picture's alpha.
	 */
	if (has_component_alpha && gen3_blend_op[op].src_alpha) {
		if (dblend == BLENDFACT_SRC_ALPHA)
			dblend = BLENDFACT_SRC_COLR;
		else if (dblend == BLENDFACT_INV_SRC_ALPHA)
			dblend = BLENDFACT_INV_SRC_COLR;
	}

	return (S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
		BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT |
		sblend << S6_CBUF_SRC_BLEND_FACT_SHIFT |
		dblend << S6_CBUF_DST_BLEND_FACT_SHIFT);
}
197
/* True when the picture format can be used directly as a gen3 render
 * target (see gen3_get_dst_format for the hardware encodings). */
static bool gen3_check_dst_format(uint32_t format)
{
	switch (format) {
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
	case PICT_r5g6b5:
	case PICT_b5g6r5:
	case PICT_a1r5g5b5:
	case PICT_x1r5g5b5:
	case PICT_a1b5g5r5:
	case PICT_x1b5g5r5:
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
	case PICT_a2b10g10r10:
	case PICT_x2b10g10r10:
#endif
	case PICT_a8:
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
	case PICT_a4b4g4r4:
	case PICT_x4b4g4r4:
		return true;
	default:
		return false;
	}
}
227
/* True when the destination format's red/blue ordering is the reverse of
 * the hardware colour-buffer layout it is bound as, so sources must be
 * emitted with red and blue swapped to compensate.  The false cases are
 * the natively-ordered ARGB-style formats plus plain A8. */
static bool gen3_dst_rb_reversed(uint32_t format)
{
	switch (format) {
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
	case PICT_r5g6b5:
	case PICT_a1r5g5b5:
	case PICT_x1r5g5b5:
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
#endif
	case PICT_a8:
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
		return false;
	default:
		return true;
	}
}
248
249#define DSTORG_HORT_BIAS(x)             ((x)<<20)
250#define DSTORG_VERT_BIAS(x)             ((x)<<16)
251
/* Translate a Render picture format into the gen3 colour-buffer format,
 * combined with the standard destination origin bias (0x8 horizontal and
 * vertical, see DSTORG_{HORT,VERT}_BIAS).  Unknown formats deliberately
 * share the default: label and fall back to ARGB8888. */
static uint32_t gen3_get_dst_format(uint32_t format)
{
#define BIAS (DSTORG_HORT_BIAS(0x8) | DSTORG_VERT_BIAS(0x8))
	switch (format) {
	default:
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
		return BIAS | COLR_BUF_ARGB8888;
	case PICT_r5g6b5:
	case PICT_b5g6r5:
		return BIAS | COLR_BUF_RGB565;
	case PICT_a1r5g5b5:
	case PICT_x1r5g5b5:
	case PICT_a1b5g5r5:
	case PICT_x1b5g5r5:
		return BIAS | COLR_BUF_ARGB1555;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
	case PICT_a2b10g10r10:
	case PICT_x2b10g10r10:
		return BIAS | COLR_BUF_ARGB2AAA;
#endif
	case PICT_a8:
		return BIAS | COLR_BUF_8BIT;
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
	case PICT_a4b4g4r4:
	case PICT_x4b4g4r4:
		return BIAS | COLR_BUF_ARGB4444;
	}
#undef BIAS
}
287
/* True when the picture's format is sampleable by the gen3 texture unit.
 * NOTE(review): this list is a subset of gen3_tex_formats — several
 * x-variants accepted by gen3_check_xformat() are absent here; presumably
 * those are handled by format substitution elsewhere — confirm. */
static bool gen3_check_format(PicturePtr p)
{
	switch (p->format) {
	case PICT_a8:
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
#ifdef PICT_a2r10g10b10
	case PICT_a2r10g10b10:
	case PICT_a2b10g10r10:
#endif
	case PICT_r5g6b5:
	case PICT_b5g6r5:
	case PICT_a1r5g5b5:
	case PICT_a1b5g5r5:
	case PICT_a4r4g4b4:
	case PICT_a4b4g4r4:
		return true;
	default:
		return false;
	}
}
311
/* Extended format check: like gen3_check_format() but also accepting the
 * x (alpha-ignored) variants listed in gen3_tex_formats. */
static bool gen3_check_xformat(PicturePtr p)
{
	switch (p->format) {
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
	case PICT_r5g6b5:
	case PICT_b5g6r5:
	case PICT_a1r5g5b5:
	case PICT_x1r5g5b5:
	case PICT_a1b5g5r5:
	case PICT_x1b5g5r5:
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
	case PICT_a2b10g10r10:
	case PICT_x2b10g10r10:
#endif
	case PICT_a8:
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
	case PICT_a4b4g4r4:
	case PICT_x4b4g4r4:
		return true;
	default:
		return false;
	}
}
341
/* Map a Render repeat mode onto the sampler SS3 coordinate-address bits,
 * applying the same mode to both the X and Y texture coordinates. */
static uint32_t gen3_texture_repeat(uint32_t repeat)
{
#define REPEAT(x) \
	(SS3_NORMALIZED_COORDS | \
	 TEXCOORDMODE_##x << SS3_TCX_ADDR_MODE_SHIFT | \
	 TEXCOORDMODE_##x << SS3_TCY_ADDR_MODE_SHIFT)
	switch (repeat) {
	default:
	case RepeatNone:
		return REPEAT(CLAMP_BORDER);
	case RepeatNormal:
		return REPEAT(WRAP);
	case RepeatPad:
		return REPEAT(CLAMP_EDGE);
	case RepeatReflect:
		return REPEAT(MIRROR);
	}
#undef REPEAT
}
361
/* As gen3_texture_repeat(), but for gradient lookup textures: only the X
 * coordinate honours the picture's repeat mode, while Y is always WRAP —
 * presumably because the gradient LUT only varies along X (confirm
 * against the gradient texture setup). */
static uint32_t gen3_gradient_repeat(uint32_t repeat)
{
#define REPEAT(x) \
	(SS3_NORMALIZED_COORDS | \
	 TEXCOORDMODE_##x  << SS3_TCX_ADDR_MODE_SHIFT | \
	 TEXCOORDMODE_WRAP << SS3_TCY_ADDR_MODE_SHIFT)
	switch (repeat) {
	default:
	case RepeatNone:
		return REPEAT(CLAMP_BORDER);
	case RepeatNormal:
		return REPEAT(WRAP);
	case RepeatPad:
		return REPEAT(CLAMP_EDGE);
	case RepeatReflect:
		return REPEAT(MIRROR);
	}
#undef REPEAT
}
381
382static bool gen3_check_repeat(PicturePtr p)
383{
384	if (!p->repeat)
385		return true;
386
387	switch (p->repeatType) {
388	case RepeatNone:
389	case RepeatNormal:
390	case RepeatPad:
391	case RepeatReflect:
392		return true;
393	default:
394		return false;
395	}
396}
397
/* Convert a Render filter to SS2 min/mag filter bits (mipmapping is
 * always disabled).  Unsupported filters should have been rejected by
 * gen3_check_filter(); the assert documents that, and NDEBUG builds
 * fall through to the nearest-filter case. */
static uint32_t gen3_filter(uint32_t filter)
{
	switch (filter) {
	default:
		assert(0);
	case PictFilterNearest:
		return (FILTER_NEAREST << SS2_MAG_FILTER_SHIFT |
			FILTER_NEAREST << SS2_MIN_FILTER_SHIFT |
			MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT);
	case PictFilterBilinear:
		return (FILTER_LINEAR  << SS2_MAG_FILTER_SHIFT |
			FILTER_LINEAR  << SS2_MIN_FILTER_SHIFT |
			MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT);
	}
}
413
414static bool gen3_check_filter(PicturePtr p)
415{
416	switch (p->filter) {
417	case PictFilterNearest:
418	case PictFilterBilinear:
419		return true;
420	default:
421		return false;
422	}
423}
424
/* Append one destination (x, y) coordinate pair to the vertex buffer. */
static inline void
gen3_emit_composite_dstcoord(struct sna *sna, int16_t dstX, int16_t dstY)
{
	OUT_VERTEX(dstX);
	OUT_VERTEX(dstY);
}
431
/* Emit a rectangle for a constant (untextured) shader: three corners in
 * the order bottom-right, bottom-left, top-left, translated by the
 * destination drawable offset. */
fastcall static void
gen3_emit_composite_primitive_constant(struct sna *sna,
				       const struct sna_composite_op *op,
				       const struct sna_composite_rectangles *r)
{
	int16_t dst_x = r->dst.x + op->dst.x;
	int16_t dst_y = r->dst.y + op->dst.y;

	gen3_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
	gen3_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
	gen3_emit_composite_dstcoord(sna, dst_x, dst_y);
}
444
445fastcall static void
446gen3_emit_composite_boxes_constant(const struct sna_composite_op *op,
447				   const BoxRec *box, int nbox,
448				   float *v)
449{
450	do {
451		v[0] = box->x2 + op->dst.x;
452		v[1] = box->y2 + op->dst.y;
453
454		v[2] = box->x1 + op->dst.x;
455		v[3] = box->y2 + op->dst.y;
456
457		v[4] = box->x1 + op->dst.x;
458		v[5] = box->y1 + op->dst.y;
459
460		box++;
461		v += 6;
462	} while (--nbox);
463}
464
/* Emit a rectangle whose gradient source needs no transform: each vertex
 * carries the dst coordinate followed by the untransformed gradient
 * coordinate (dst position plus the channel offset). */
fastcall static void
gen3_emit_composite_primitive_identity_gradient(struct sna *sna,
						const struct sna_composite_op *op,
						const struct sna_composite_rectangles *r)
{
	int16_t dst_x, dst_y;
	int16_t src_x, src_y;

	dst_x = r->dst.x + op->dst.x;
	dst_y = r->dst.y + op->dst.y;
	src_x = r->src.x + op->src.offset[0];
	src_y = r->src.y + op->src.offset[1];

	gen3_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
	OUT_VERTEX(src_x + r->width);
	OUT_VERTEX(src_y + r->height);

	gen3_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
	OUT_VERTEX(src_x);
	OUT_VERTEX(src_y + r->height);

	gen3_emit_composite_dstcoord(sna, dst_x, dst_y);
	OUT_VERTEX(src_x);
	OUT_VERTEX(src_y);
}
490
491fastcall static void
492gen3_emit_composite_boxes_identity_gradient(const struct sna_composite_op *op,
493					    const BoxRec *box, int nbox,
494					    float *v)
495{
496	do {
497		v[0] = box->x2 + op->dst.x;
498		v[1] = box->y2 + op->dst.y;
499		v[2] = box->x2 + op->src.offset[0];
500		v[3] = box->y2 + op->src.offset[1];
501
502		v[4] = box->x1 + op->dst.x;
503		v[5] = box->y2 + op->dst.y;
504		v[6] = box->x1 + op->src.offset[0];
505		v[7] = box->y2 + op->src.offset[1];
506
507		v[8] = box->x1 + op->dst.x;
508		v[9] = box->y1 + op->dst.y;
509		v[10] = box->x1 + op->src.offset[0];
510		v[11] = box->y1 + op->src.offset[1];
511
512		v += 12;
513		box++;
514	} while (--nbox);
515}
516
/* Emit a rectangle whose gradient coordinates are run through an affine
 * transform and scaled (via _sna_get_transformed_scaled) before being
 * written directly into the vertex buffer: 12 floats = 3 vertices. */
fastcall static void
gen3_emit_composite_primitive_affine_gradient(struct sna *sna,
					      const struct sna_composite_op *op,
					      const struct sna_composite_rectangles *r)
{
	PictTransform *transform = op->src.transform;
	int16_t dst_x, dst_y;
	int16_t src_x, src_y;
	float *v;

	dst_x = r->dst.x + op->dst.x;
	dst_y = r->dst.y + op->dst.y;
	src_x = r->src.x + op->src.offset[0];
	src_y = r->src.y + op->src.offset[1];

	/* Reserve space for the three vertices up front. */
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[0] = dst_x + r->width;
	v[1] = dst_y + r->height;
	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
				    transform, op->src.scale,
				    &v[2], &v[3]);

	v[4] = dst_x;
	v[5] = dst_y + r->height;
	_sna_get_transformed_scaled(src_x, src_y + r->height,
				    transform, op->src.scale,
				    &v[6], &v[7]);

	v[8] = dst_x;
	v[9] = dst_y;
	_sna_get_transformed_scaled(src_x, src_y,
				    transform, op->src.scale,
				    &v[10], &v[11]);
}
554
/* Batched affine-gradient emission: for each box write the dst corner and
 * its affine-transformed, scaled gradient coordinate (three vertices,
 * 12 floats per box). */
fastcall static void
gen3_emit_composite_boxes_affine_gradient(const struct sna_composite_op *op,
					  const BoxRec *box, int nbox,
					  float *v)
{
	const PictTransform *transform = op->src.transform;

	do {
		v[0] = box->x2 + op->dst.x;
		v[1] = box->y2 + op->dst.y;
		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
					    box->y2 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[2], &v[3]);

		v[4] = box->x1 + op->dst.x;
		v[5] = box->y2 + op->dst.y;
		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
					    box->y2 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[6], &v[7]);

		v[8] = box->x1 + op->dst.x;
		v[9] = box->y1 + op->dst.y;
		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
					    box->y1 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[10], &v[11]);

		box++;
		v += 12;
	} while (--nbox);
}
588
/* Identity-transform textured source: three vertices of (dst x, dst y,
 * src s, src t), where the source coordinate is the dst-space position
 * plus the channel offset, normalized by op->src.scale.  Components
 * shared between vertices are written once and aliased. */
fastcall static void
gen3_emit_composite_primitive_identity_source(struct sna *sna,
					      const struct sna_composite_op *op,
					      const struct sna_composite_rectangles *r)
{
	float w = r->width;
	float h = r->height;
	float *v;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[8] = v[4] = r->dst.x + op->dst.x;
	v[0] = v[4] + w;

	v[9] = r->dst.y + op->dst.y;
	v[5] = v[1] = v[9] + h;

	v[10] = v[6] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
	v[2] = v[6] + w * op->src.scale[0];

	v[11] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
	v[7] = v[3] = v[11] + h * op->src.scale[1];
}
614
615fastcall static void
616gen3_emit_composite_boxes_identity_source(const struct sna_composite_op *op,
617					  const BoxRec *box, int nbox,
618					  float *v)
619{
620	do {
621		v[0] = box->x2 + op->dst.x;
622		v[8] = v[4] = box->x1 + op->dst.x;
623		v[5] = v[1] = box->y2 + op->dst.y;
624		v[9] = box->y1 + op->dst.y;
625
626		v[10] = v[6] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
627		v[2] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
628
629		v[11] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
630		v[7] = v[3] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
631
632		v += 12;
633		box++;
634	} while (--nbox);
635}
636
/* Fast path of gen3_emit_composite_primitive_identity_source() for when
 * both the destination and source channel offsets are zero, so the raw
 * rectangle coordinates can be used directly. */
fastcall static void
gen3_emit_composite_primitive_identity_source_no_offset(struct sna *sna,
							const struct sna_composite_op *op,
							const struct sna_composite_rectangles *r)
{
	float w = r->width;
	float h = r->height;
	float *v;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[8] = v[4] = r->dst.x;
	v[9] = r->dst.y;

	v[0] = v[4] + w;
	v[5] = v[1] = v[9] + h;

	v[10] = v[6] = r->src.x * op->src.scale[0];
	v[11] = r->src.y * op->src.scale[1];

	v[2] = v[6] + w * op->src.scale[0];
	v[7] = v[3] = v[11] + h * op->src.scale[1];
}
662
663fastcall static void
664gen3_emit_composite_boxes_identity_source_no_offset(const struct sna_composite_op *op,
665						    const BoxRec *box, int nbox,
666						    float *v)
667{
668	do {
669		v[0] = box->x2;
670		v[8] = v[4] = box->x1;
671		v[5] = v[1] = box->y2;
672		v[9] = box->y1;
673
674		v[10] = v[6] = box->x1 * op->src.scale[0];
675		v[2] = box->x2 * op->src.scale[0];
676
677		v[11] = box->y1 * op->src.scale[1];
678		v[7] = v[3] = box->y2 * op->src.scale[1];
679
680		v += 12;
681		box++;
682	} while (--nbox);
683}
684
/* Affine-transformed textured source: dst coordinates are written
 * directly while the source coordinates go through the channel's affine
 * transform and normalization (12 floats = 3 vertices). */
fastcall static void
gen3_emit_composite_primitive_affine_source(struct sna *sna,
					    const struct sna_composite_op *op,
					    const struct sna_composite_rectangles *r)
{
	PictTransform *transform = op->src.transform;
	int16_t dst_x = r->dst.x + op->dst.x;
	int16_t dst_y = r->dst.y + op->dst.y;
	int src_x = r->src.x + (int)op->src.offset[0];
	int src_y = r->src.y + (int)op->src.offset[1];
	float *v;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[0] = dst_x + r->width;
	v[5] = v[1] = dst_y + r->height;
	v[8] = v[4] = dst_x;
	v[9] = dst_y;

	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
				    transform, op->src.scale,
				    &v[2], &v[3]);

	_sna_get_transformed_scaled(src_x, src_y + r->height,
				    transform, op->src.scale,
				    &v[6], &v[7]);

	_sna_get_transformed_scaled(src_x, src_y,
				    transform, op->src.scale,
				    &v[10], &v[11]);
}
718
/* Batched affine-source emission: dst corners written directly, source
 * coordinates transformed and scaled per vertex (12 floats per box). */
fastcall static void
gen3_emit_composite_boxes_affine_source(const struct sna_composite_op *op,
					const BoxRec *box, int nbox,
					float *v)
{
	const PictTransform *transform = op->src.transform;

	do {
		v[0] = box->x2 + op->dst.x;
		v[5] = v[1] = box->y2 + op->dst.y;
		v[8] = v[4] = box->x1 + op->dst.x;
		v[9] = box->y1 + op->dst.y;

		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
					    box->y2 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[2], &v[3]);

		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
					    box->y2 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[6], &v[7]);

		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
					    box->y1 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[10], &v[11]);

		v += 12;
		box++;
	} while (--nbox);
}
751
/* Constant source combined with an identity-transform mask: each vertex
 * carries the dst coordinate plus the normalized mask coordinate (dst
 * position plus mask offset, scaled by op->mask.scale). */
fastcall static void
gen3_emit_composite_primitive_constant_identity_mask(struct sna *sna,
						     const struct sna_composite_op *op,
						     const struct sna_composite_rectangles *r)
{
	float w = r->width;
	float h = r->height;
	float *v;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[8] = v[4] = r->dst.x + op->dst.x;
	v[0] = v[4] + w;

	v[9] = r->dst.y + op->dst.y;
	v[5] = v[1] = v[9] + h;

	v[10] = v[6] = (r->mask.x + op->mask.offset[0]) * op->mask.scale[0];
	v[2] = v[6] + w * op->mask.scale[0];

	v[11] = (r->mask.y + op->mask.offset[1]) * op->mask.scale[1];
	v[7] = v[3] = v[11] + h * op->mask.scale[1];
}
777
/* Fast path of the constant-source/identity-mask emitter for when both
 * the destination and mask channel offsets are zero. */
fastcall static void
gen3_emit_composite_primitive_constant_identity_mask_no_offset(struct sna *sna,
							       const struct sna_composite_op *op,
							       const struct sna_composite_rectangles *r)
{
	float w = r->width;
	float h = r->height;
	float *v;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[8] = v[4] = r->dst.x;
	v[9] = r->dst.y;

	v[0] = v[4] + w;
	v[5] = v[1] = v[9] + h;

	v[10] = v[6] = r->mask.x * op->mask.scale[0];
	v[11] = r->mask.y * op->mask.scale[1];

	v[2] = v[6] + w * op->mask.scale[0];
	v[7] = v[3] = v[11] + h * op->mask.scale[1];
}
803
804fastcall static void
805gen3_emit_composite_primitive_identity_source_mask(struct sna *sna,
806						   const struct sna_composite_op *op,
807						   const struct sna_composite_rectangles *r)
808{
809	float dst_x, dst_y;
810	float src_x, src_y;
811	float msk_x, msk_y;
812	float w, h;
813	float *v;
814
815	dst_x = r->dst.x + op->dst.x;
816	dst_y = r->dst.y + op->dst.y;
817	src_x = r->src.x + op->src.offset[0];
818	src_y = r->src.y + op->src.offset[1];
819	msk_x = r->mask.x + op->mask.offset[0];
820	msk_y = r->mask.y + op->mask.offset[1];
821	w = r->width;
822	h = r->height;
823
824	v = sna->render.vertices + sna->render.vertex_used;
825	sna->render.vertex_used += 18;
826	assert(sna->render.vertex_used <= sna->render.vertex_size);
827
828	v[0] = dst_x + w;
829	v[1] = dst_y + h;
830	v[2] = (src_x + w) * op->src.scale[0];
831	v[3] = (src_y + h) * op->src.scale[1];
832	v[4] = (msk_x + w) * op->mask.scale[0];
833	v[5] = (msk_y + h) * op->mask.scale[1];
834
835	v[6] = dst_x;
836	v[7] = v[1];
837	v[8] = src_x * op->src.scale[0];
838	v[9] = v[3];
839	v[10] = msk_x * op->mask.scale[0];
840	v[11] =v[5];
841
842	v[12] = v[6];
843	v[13] = dst_y;
844	v[14] = v[8];
845	v[15] = src_y * op->src.scale[1];
846	v[16] = v[10];
847	v[17] = msk_y * op->mask.scale[1];
848}
849
/* Affine-transformed source combined with an identity mask: the source
 * (s,t) pairs go through _sna_get_transformed_scaled() while the mask
 * coordinates are simply offset and normalized (18 floats = 3 vertices).
 * NOTE(review): src_x/src_y are int16_t here, unlike the float locals in
 * the identity variant above — assumes the summed source coordinates fit
 * in 16 bits; confirm against the callers' clamping. */
fastcall static void
gen3_emit_composite_primitive_affine_source_mask(struct sna *sna,
						 const struct sna_composite_op *op,
						 const struct sna_composite_rectangles *r)
{
	int16_t src_x, src_y;
	float dst_x, dst_y;
	float msk_x, msk_y;
	float w, h;
	float *v;

	dst_x = r->dst.x + op->dst.x;
	dst_y = r->dst.y + op->dst.y;
	src_x = r->src.x + op->src.offset[0];
	src_y = r->src.y + op->src.offset[1];
	msk_x = r->mask.x + op->mask.offset[0];
	msk_y = r->mask.y + op->mask.offset[1];
	w = r->width;
	h = r->height;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 18;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[0] = dst_x + w;
	v[1] = dst_y + h;
	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
				    op->src.transform, op->src.scale,
				    &v[2], &v[3]);
	v[4] = (msk_x + w) * op->mask.scale[0];
	v[5] = (msk_y + h) * op->mask.scale[1];

	v[6] = dst_x;
	v[7] = v[1];
	_sna_get_transformed_scaled(src_x, src_y + r->height,
				    op->src.transform, op->src.scale,
				    &v[8], &v[9]);
	v[10] = msk_x * op->mask.scale[0];
	v[11] =v[5];

	v[12] = v[6];
	v[13] = dst_y;
	_sna_get_transformed_scaled(src_x, src_y,
				    op->src.transform, op->src.scale,
				    &v[14], &v[15]);
	v[16] = v[10];
	v[17] = msk_y * op->mask.scale[1];
}
898
/* Emit the texture coordinates for one channel at dst-space (x, y).
 * Constant-like shaders contribute no coordinates at all; textured
 * shaders emit a scaled (s, t) pair for affine transforms, or a
 * four-component (s, t, 0, w) for projective ones. */
static void
gen3_emit_composite_texcoord(struct sna *sna,
			     const struct sna_composite_channel *channel,
			     int16_t x, int16_t y)
{
	float s = 0, t = 0, w = 1;

	switch (channel->u.gen3.type) {
	case SHADER_OPACITY:
	case SHADER_NONE:
	case SHADER_ZERO:
	case SHADER_BLACK:
	case SHADER_WHITE:
	case SHADER_CONSTANT:
		break;

	case SHADER_LINEAR:
	case SHADER_RADIAL:
	case SHADER_TEXTURE:
		x += channel->offset[0];
		y += channel->offset[1];
		if (channel->is_affine) {
			sna_get_transformed_coordinates(x, y,
							channel->transform,
							&s, &t);
			OUT_VERTEX(s * channel->scale[0]);
			OUT_VERTEX(t * channel->scale[1]);
		} else {
			sna_get_transformed_coordinates_3d(x, y,
							   channel->transform,
							   &s, &t, &w);
			OUT_VERTEX(s * channel->scale[0]);
			OUT_VERTEX(t * channel->scale[1]);
			OUT_VERTEX(0);
			OUT_VERTEX(w);
		}
		break;
	}
}
938
/* Emit one full vertex: dst coordinate followed by whatever texture
 * coordinates the source and mask channels require. */
static void
gen3_emit_composite_vertex(struct sna *sna,
			   const struct sna_composite_op *op,
			   int16_t srcX, int16_t srcY,
			   int16_t maskX, int16_t maskY,
			   int16_t dstX, int16_t dstY)
{
	gen3_emit_composite_dstcoord(sna, dstX, dstY);
	gen3_emit_composite_texcoord(sna, &op->src, srcX, srcY);
	gen3_emit_composite_texcoord(sna, &op->mask, maskX, maskY);
}
950
/* Generic (slowest) rectangle emitter, used when no specialized variant
 * applies: emits the three corners bottom-right, bottom-left, top-left,
 * delegating per-channel coordinate generation to the helpers above. */
fastcall static void
gen3_emit_composite_primitive(struct sna *sna,
			      const struct sna_composite_op *op,
			      const struct sna_composite_rectangles *r)
{
	gen3_emit_composite_vertex(sna, op,
				   r->src.x + r->width,
				   r->src.y + r->height,
				   r->mask.x + r->width,
				   r->mask.y + r->height,
				   op->dst.x + r->dst.x + r->width,
				   op->dst.y + r->dst.y + r->height);
	gen3_emit_composite_vertex(sna, op,
				   r->src.x,
				   r->src.y + r->height,
				   r->mask.x,
				   r->mask.y + r->height,
				   op->dst.x + r->dst.x,
				   op->dst.y + r->dst.y + r->height);
	gen3_emit_composite_vertex(sna, op,
				   r->src.x,
				   r->src.y,
				   r->mask.x,
				   r->mask.y,
				   op->dst.x + r->dst.x,
				   op->dst.y + r->dst.y);
}
978
979#if defined(sse2) && !defined(__x86_64__)
980sse2 fastcall static void
981gen3_emit_composite_primitive_constant__sse2(struct sna *sna,
982					     const struct sna_composite_op *op,
983					     const struct sna_composite_rectangles *r)
984{
985	float *v;
986
987	v = sna->render.vertices + sna->render.vertex_used;
988	sna->render.vertex_used += 6;
989	assert(sna->render.vertex_used <= sna->render.vertex_size);
990
991	v[4] = v[2] = r->dst.x + op->dst.x;
992	v[5] = r->dst.y + op->dst.y;
993
994	v[0] = v[2] + r->width;
995	v[3] = v[1] = v[5] + r->height;
996
997}
998
999sse2 fastcall static void
1000gen3_emit_composite_boxes_constant__sse2(const struct sna_composite_op *op,
1001					 const BoxRec *box, int nbox,
1002					 float *v)
1003{
1004	do {
1005		v[0] = box->x2 + op->dst.x;
1006		v[3] = v[1] = box->y2 + op->dst.y;
1007		v[4] = v[2] = box->x1 + op->dst.x;
1008		v[5] = box->y1 + op->dst.y;
1009
1010		box++;
1011		v += 6;
1012	} while (--nbox);
1013}
1014
/* SSE2-compiled identity-gradient emitter: writes the three vertices of
 * (dst x, dst y, gradient x, gradient y) directly into the vertex
 * buffer, reusing shared components. */
sse2 fastcall static void
gen3_emit_composite_primitive_identity_gradient__sse2(struct sna *sna,
						      const struct sna_composite_op *op,
						      const struct sna_composite_rectangles *r)
{
	int16_t x, y;
	float *v;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	x = r->dst.x + op->dst.x;
	y = r->dst.y + op->dst.y;
	v[0] = x + r->width;
	v[5] = v[1] = y + r->height;
	v[8] = v[4] = x;
	v[9] = y;

	x = r->src.x + op->src.offset[0];
	y = r->src.y + op->src.offset[1];
	v[2] = x + r->width;
	v[7] = v[3] = y + r->height;
	v[10] = v[6] = x;
	v[11] = y;
}
1041
1042sse2 fastcall static void
1043gen3_emit_composite_boxes_identity_gradient__sse2(const struct sna_composite_op *op,
1044						  const BoxRec *box, int nbox,
1045						  float *v)
1046{
1047	do {
1048		v[0] = box->x2 + op->dst.x;
1049		v[5] = v[1] = box->y2 + op->dst.y;
1050		v[8] = v[4] = box->x1 + op->dst.x;
1051		v[9] = box->y1 + op->dst.y;
1052
1053		v[2] = box->x2 + op->src.offset[0];
1054		v[7] = v[3] = box->y2 + op->src.offset[1];
1055		v[10] = v[6] = box->x1 + op->src.offset[0];
1056		v[11] = box->y1 + op->src.offset[1];
1057
1058		v += 12;
1059		box++;
1060	} while (--nbox);
1061}
1062
/* SSE2-compiled affine-gradient emitter: identical logic to
 * gen3_emit_composite_primitive_affine_gradient(), compiled with SSE2
 * codegen enabled. */
sse2 fastcall static void
gen3_emit_composite_primitive_affine_gradient__sse2(struct sna *sna,
						    const struct sna_composite_op *op,
						    const struct sna_composite_rectangles *r)
{
	PictTransform *transform = op->src.transform;
	int16_t dst_x, dst_y;
	int16_t src_x, src_y;
	float *v;

	dst_x = r->dst.x + op->dst.x;
	dst_y = r->dst.y + op->dst.y;
	src_x = r->src.x + op->src.offset[0];
	src_y = r->src.y + op->src.offset[1];

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[0] = dst_x + r->width;
	v[1] = dst_y + r->height;
	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
				    transform, op->src.scale,
				    &v[2], &v[3]);

	v[4] = dst_x;
	v[5] = dst_y + r->height;
	_sna_get_transformed_scaled(src_x, src_y + r->height,
				    transform, op->src.scale,
				    &v[6], &v[7]);

	v[8] = dst_x;
	v[9] = dst_y;
	_sna_get_transformed_scaled(src_x, src_y,
				    transform, op->src.scale,
				    &v[10], &v[11]);
}
1100
/* SSE2-compiled batched affine-gradient emitter: identical logic to
 * gen3_emit_composite_boxes_affine_gradient(). */
sse2 fastcall static void
gen3_emit_composite_boxes_affine_gradient__sse2(const struct sna_composite_op *op,
						const BoxRec *box, int nbox,
						float *v)
{
	const PictTransform *transform = op->src.transform;

	do {
		v[0] = box->x2 + op->dst.x;
		v[1] = box->y2 + op->dst.y;
		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
					    box->y2 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[2], &v[3]);

		v[4] = box->x1 + op->dst.x;
		v[5] = box->y2 + op->dst.y;
		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
					    box->y2 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[6], &v[7]);

		v[8] = box->x1 + op->dst.x;
		v[9] = box->y1 + op->dst.y;
		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
					    box->y1 + op->src.offset[1],
					    transform, op->src.scale,
					    &v[10], &v[11]);

		box++;
		v += 12;
	} while (--nbox);
}
1134
1135sse2 fastcall static void
1136gen3_emit_composite_primitive_identity_source__sse2(struct sna *sna,
1137						    const struct sna_composite_op *op,
1138						    const struct sna_composite_rectangles *r)
1139{
1140	float w = r->width;
1141	float h = r->height;
1142	float *v;
1143
1144	v = sna->render.vertices + sna->render.vertex_used;
1145	sna->render.vertex_used += 12;
1146	assert(sna->render.vertex_used <= sna->render.vertex_size);
1147
1148	v[8] = v[4] = r->dst.x + op->dst.x;
1149	v[0] = v[4] + w;
1150
1151	v[9] = r->dst.y + op->dst.y;
1152	v[5] = v[1] = v[9] + h;
1153
1154	v[10] = v[6] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
1155	v[2] = v[6] + w * op->src.scale[0];
1156
1157	v[11] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
1158	v[7] = v[3] = v[11] + h * op->src.scale[1];
1159}
1160
1161sse2 fastcall static void
1162gen3_emit_composite_boxes_identity_source__sse2(const struct sna_composite_op *op,
1163						const BoxRec *box, int nbox,
1164						float *v)
1165{
1166	do {
1167		v[0] = box->x2 + op->dst.x;
1168		v[8] = v[4] = box->x1 + op->dst.x;
1169		v[5] = v[1] = box->y2 + op->dst.y;
1170		v[9] = box->y1 + op->dst.y;
1171
1172		v[10] = v[6] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
1173		v[2] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
1174
1175		v[11] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
1176		v[7] = v[3] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
1177
1178		v += 12;
1179		box++;
1180	} while (--nbox);
1181}
1182
1183sse2 fastcall static void
1184gen3_emit_composite_primitive_identity_source_no_offset__sse2(struct sna *sna,
1185							      const struct sna_composite_op *op,
1186							      const struct sna_composite_rectangles *r)
1187{
1188	float w = r->width;
1189	float h = r->height;
1190	float *v;
1191
1192	v = sna->render.vertices + sna->render.vertex_used;
1193	sna->render.vertex_used += 12;
1194	assert(sna->render.vertex_used <= sna->render.vertex_size);
1195
1196	v[8] = v[4] = r->dst.x;
1197	v[9] = r->dst.y;
1198
1199	v[0] = v[4] + w;
1200	v[5] = v[1] = v[9] + h;
1201
1202	v[10] = v[6] = r->src.x * op->src.scale[0];
1203	v[11] = r->src.y * op->src.scale[1];
1204
1205	v[2] = v[6] + w * op->src.scale[0];
1206	v[7] = v[3] = v[11] + h * op->src.scale[1];
1207}
1208
1209sse2 fastcall static void
1210gen3_emit_composite_boxes_identity_source_no_offset__sse2(const struct sna_composite_op *op,
1211							  const BoxRec *box, int nbox,
1212							  float *v)
1213{
1214	do {
1215		v[0] = box->x2;
1216		v[8] = v[4] = box->x1;
1217		v[5] = v[1] = box->y2;
1218		v[9] = box->y1;
1219
1220		v[10] = v[6] = box->x1 * op->src.scale[0];
1221		v[2] = box->x2 * op->src.scale[0];
1222
1223		v[11] = box->y1 * op->src.scale[1];
1224		v[7] = v[3] = box->y2 * op->src.scale[1];
1225
1226		v += 12;
1227		box++;
1228	} while (--nbox);
1229}
1230
1231sse2 fastcall static void
1232gen3_emit_composite_primitive_affine_source__sse2(struct sna *sna,
1233						  const struct sna_composite_op *op,
1234						  const struct sna_composite_rectangles *r)
1235{
1236	PictTransform *transform = op->src.transform;
1237	int16_t dst_x = r->dst.x + op->dst.x;
1238	int16_t dst_y = r->dst.y + op->dst.y;
1239	int src_x = r->src.x + (int)op->src.offset[0];
1240	int src_y = r->src.y + (int)op->src.offset[1];
1241	float *v;
1242
1243	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d\n",
1244	     __FUNCTION__, src_x, src_y, dst_x, dst_y, r->width, r->height));
1245
1246	v = sna->render.vertices + sna->render.vertex_used;
1247	sna->render.vertex_used += 12;
1248	assert(sna->render.vertex_used <= sna->render.vertex_size);
1249
1250	v[0] = dst_x + r->width;
1251	v[5] = v[1] = dst_y + r->height;
1252	v[8] = v[4] = dst_x;
1253	v[9] = dst_y;
1254
1255	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
1256				    transform, op->src.scale,
1257				    &v[2], &v[3]);
1258
1259	_sna_get_transformed_scaled(src_x, src_y + r->height,
1260				    transform, op->src.scale,
1261				    &v[6], &v[7]);
1262
1263	_sna_get_transformed_scaled(src_x, src_y,
1264				    transform, op->src.scale,
1265				    &v[10], &v[11]);
1266}
1267
1268sse2 fastcall static void
1269gen3_emit_composite_boxes_affine_source__sse2(const struct sna_composite_op *op,
1270					      const BoxRec *box, int nbox,
1271					      float *v)
1272{
1273	const PictTransform *transform = op->src.transform;
1274
1275	do {
1276		DBG(("%s: box=(%d, %d), (%d, %d), src.offset=(%d, %d)\n",
1277		     __FUNCTION__, box->x1, box->y1, box->x2, box->y2, op->src.offset[0], op->src.offset[1]));
1278
1279		v[0] = box->x2 + op->dst.x;
1280		v[5] = v[1] = box->y2 + op->dst.y;
1281		v[8] = v[4] = box->x1 + op->dst.x;
1282		v[9] = box->y1 + op->dst.y;
1283
1284		_sna_get_transformed_scaled(box->x2 + op->src.offset[0],
1285					    box->y2 + op->src.offset[1],
1286					    transform, op->src.scale,
1287					    &v[2], &v[3]);
1288
1289		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
1290					    box->y2 + op->src.offset[1],
1291					    transform, op->src.scale,
1292					    &v[6], &v[7]);
1293
1294		_sna_get_transformed_scaled(box->x1 + op->src.offset[0],
1295					    box->y1 + op->src.offset[1],
1296					    transform, op->src.scale,
1297					    &v[10], &v[11]);
1298
1299		v += 12;
1300		box++;
1301	} while (--nbox);
1302}
1303
1304sse2 fastcall static void
1305gen3_emit_composite_primitive_constant_identity_mask__sse2(struct sna *sna,
1306							   const struct sna_composite_op *op,
1307							   const struct sna_composite_rectangles *r)
1308{
1309	float w = r->width;
1310	float h = r->height;
1311	float *v;
1312
1313	v = sna->render.vertices + sna->render.vertex_used;
1314	sna->render.vertex_used += 12;
1315	assert(sna->render.vertex_used <= sna->render.vertex_size);
1316
1317	v[8] = v[4] = r->dst.x + op->dst.x;
1318	v[0] = v[4] + w;
1319
1320	v[9] = r->dst.y + op->dst.y;
1321	v[5] = v[1] = v[9] + h;
1322
1323	v[10] = v[6] = (r->mask.x + op->mask.offset[0]) * op->mask.scale[0];
1324	v[2] = v[6] + w * op->mask.scale[0];
1325
1326	v[11] = (r->mask.y + op->mask.offset[1]) * op->mask.scale[1];
1327	v[7] = v[3] = v[11] + h * op->mask.scale[1];
1328}
1329
1330sse2 fastcall static void
1331gen3_emit_composite_primitive_constant_identity_mask_no_offset__sse2(struct sna *sna,
1332								     const struct sna_composite_op *op,
1333								     const struct sna_composite_rectangles *r)
1334{
1335	float w = r->width;
1336	float h = r->height;
1337	float *v;
1338
1339	v = sna->render.vertices + sna->render.vertex_used;
1340	sna->render.vertex_used += 12;
1341	assert(sna->render.vertex_used <= sna->render.vertex_size);
1342
1343	v[8] = v[4] = r->dst.x;
1344	v[9] = r->dst.y;
1345
1346	v[0] = v[4] + w;
1347	v[5] = v[1] = v[9] + h;
1348
1349	v[10] = v[6] = r->mask.x * op->mask.scale[0];
1350	v[11] = r->mask.y * op->mask.scale[1];
1351
1352	v[2] = v[6] + w * op->mask.scale[0];
1353	v[7] = v[3] = v[11] + h * op->mask.scale[1];
1354}
1355
1356sse2 fastcall static void
1357gen3_emit_composite_primitive_identity_source_mask__sse2(struct sna *sna,
1358							 const struct sna_composite_op *op,
1359							 const struct sna_composite_rectangles *r)
1360{
1361	float dst_x, dst_y;
1362	float src_x, src_y;
1363	float msk_x, msk_y;
1364	float w, h;
1365	float *v;
1366
1367	dst_x = r->dst.x + op->dst.x;
1368	dst_y = r->dst.y + op->dst.y;
1369	src_x = r->src.x + op->src.offset[0];
1370	src_y = r->src.y + op->src.offset[1];
1371	msk_x = r->mask.x + op->mask.offset[0];
1372	msk_y = r->mask.y + op->mask.offset[1];
1373	w = r->width;
1374	h = r->height;
1375
1376	v = sna->render.vertices + sna->render.vertex_used;
1377	sna->render.vertex_used += 18;
1378	assert(sna->render.vertex_used <= sna->render.vertex_size);
1379
1380	v[0] = dst_x + w;
1381	v[1] = dst_y + h;
1382	v[2] = (src_x + w) * op->src.scale[0];
1383	v[3] = (src_y + h) * op->src.scale[1];
1384	v[4] = (msk_x + w) * op->mask.scale[0];
1385	v[5] = (msk_y + h) * op->mask.scale[1];
1386
1387	v[6] = dst_x;
1388	v[7] = v[1];
1389	v[8] = src_x * op->src.scale[0];
1390	v[9] = v[3];
1391	v[10] = msk_x * op->mask.scale[0];
1392	v[11] =v[5];
1393
1394	v[12] = v[6];
1395	v[13] = dst_y;
1396	v[14] = v[8];
1397	v[15] = src_y * op->src.scale[1];
1398	v[16] = v[10];
1399	v[17] = msk_y * op->mask.scale[1];
1400}
1401
1402sse2 fastcall static void
1403gen3_emit_composite_primitive_affine_source_mask__sse2(struct sna *sna,
1404						       const struct sna_composite_op *op,
1405						       const struct sna_composite_rectangles *r)
1406{
1407	int16_t src_x, src_y;
1408	float dst_x, dst_y;
1409	float msk_x, msk_y;
1410	float w, h;
1411	float *v;
1412
1413	dst_x = r->dst.x + op->dst.x;
1414	dst_y = r->dst.y + op->dst.y;
1415	src_x = r->src.x + op->src.offset[0];
1416	src_y = r->src.y + op->src.offset[1];
1417	msk_x = r->mask.x + op->mask.offset[0];
1418	msk_y = r->mask.y + op->mask.offset[1];
1419	w = r->width;
1420	h = r->height;
1421
1422	v = sna->render.vertices + sna->render.vertex_used;
1423	sna->render.vertex_used += 18;
1424	assert(sna->render.vertex_used <= sna->render.vertex_size);
1425
1426	v[0] = dst_x + w;
1427	v[1] = dst_y + h;
1428	_sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
1429				    op->src.transform, op->src.scale,
1430				    &v[2], &v[3]);
1431	v[4] = (msk_x + w) * op->mask.scale[0];
1432	v[5] = (msk_y + h) * op->mask.scale[1];
1433
1434	v[6] = dst_x;
1435	v[7] = v[1];
1436	_sna_get_transformed_scaled(src_x, src_y + r->height,
1437				    op->src.transform, op->src.scale,
1438				    &v[8], &v[9]);
1439	v[10] = msk_x * op->mask.scale[0];
1440	v[11] =v[5];
1441
1442	v[12] = v[6];
1443	v[13] = dst_y;
1444	_sna_get_transformed_scaled(src_x, src_y,
1445				    op->src.transform, op->src.scale,
1446				    &v[14], &v[15]);
1447	v[16] = v[10];
1448	v[17] = msk_y * op->mask.scale[1];
1449}
1450#endif
1451
/* Project a homogeneous texture coordinate down to 2D: emit shader
 * instructions computing 1/w and multiplying (x, y) by it, leaving
 * (x/w, y/w, 0, 1/w) in register 'out'.  'in' and 'out' are fragment
 * shader register indices.
 */
static inline void
gen3_2d_perspective(struct sna *sna, int in, int out)
{
	gen3_fs_rcp(out, 0, gen3_fs_operand(in, W, W, W, W));
	gen3_fs_mul(out,
		    gen3_fs_operand(in, X, Y, ZERO, ONE),
		    gen3_fs_operand_reg(out));
}
1460
/* Emit shader instructions computing the 1D linear-gradient coordinate:
 * out.x = dot((x, y, 1), gradient constants) — the projection of the
 * fragment position onto the gradient vector, used to index the
 * gradient ramp texture.
 */
static inline void
gen3_linear_coord(struct sna *sna,
		  const struct sna_composite_channel *channel,
		  int in, int out)
{
	int c = channel->u.gen3.constants;

	/* Perspective-correct the incoming coordinate first if the
	 * channel transform is projective. */
	if (!channel->is_affine) {
		gen3_2d_perspective(sna, in, FS_U0);
		in = FS_U0;
	}

	/* Clear the unused components, then compute the dot product
	 * into out.x only. */
	gen3_fs_mov(out, gen3_fs_operand_zero());
	gen3_fs_dp3(out, MASK_X,
		    gen3_fs_operand(in, X, Y, ONE, ZERO),
		    gen3_fs_operand_reg(c));
}
1478
/* Emit shader instructions computing the radial-gradient parameter t
 * for the fragment at 'in', leaving the result in out.x.  Two cases:
 * RADIAL_ONE for a degenerate gradient (single effective centre) and
 * RADIAL_TWO for the general two-circle gradient (quadratic solve).
 * The per-gradient coefficients live in the constant registers starting
 * at channel->u.gen3.constants.
 */
static void
gen3_radial_coord(struct sna *sna,
		  const struct sna_composite_channel *channel,
		  int in, int out)
{
	int c = channel->u.gen3.constants;

	/* Perspective-correct the incoming coordinate if required. */
	if (!channel->is_affine) {
		gen3_2d_perspective(sna, in, FS_U0);
		in = FS_U0;
	}

	switch (channel->u.gen3.mode) {
	case RADIAL_ONE:
		/*
		   pdx = (x - c1x) / dr, pdy = (y - c1y) / dr;
		   r² = pdx*pdx + pdy*pdy
		   t = r²/sqrt(r²) - r1/dr;
		   */
		gen3_fs_mad(FS_U0, MASK_X | MASK_Y,
			    gen3_fs_operand(in, X, Y, ZERO, ZERO),
			    gen3_fs_operand(c, Z, Z, ZERO, ZERO),
			    gen3_fs_operand(c, NEG_X, NEG_Y, ZERO, ZERO));
		gen3_fs_dp2add(FS_U0, MASK_X,
			       gen3_fs_operand(FS_U0, X, Y, ZERO, ZERO),
			       gen3_fs_operand(FS_U0, X, Y, ZERO, ZERO),
			       gen3_fs_operand_zero());
		gen3_fs_rsq(out, MASK_X, gen3_fs_operand(FS_U0, X, X, X, X));
		gen3_fs_mad(out, 0,
			    gen3_fs_operand(FS_U0, X, ZERO, ZERO, ZERO),
			    gen3_fs_operand(out, X, ZERO, ZERO, ZERO),
			    gen3_fs_operand(c, W, ZERO, ZERO, ZERO));
		break;

	case RADIAL_TWO:
		/*
		   pdx = x - c1x, pdy = y - c1y;
		   A = dx² + dy² - dr²
		   B = -2*(pdx*dx + pdy*dy + r1*dr);
		   C = pdx² + pdy² - r1²;
		   det = B*B - 4*A*C;
		   t = (-B + sqrt (det)) / (2 * A)
		   */

		/* u0.x = pdx, u0.y = pdy, u[0].z = r1; */
		gen3_fs_add(FS_U0,
			    gen3_fs_operand(in, X, Y, ZERO, ZERO),
			    gen3_fs_operand(c, X, Y, Z, ZERO));
		/* u0.x = pdx, u0.y = pdy, u[0].z = r1, u[0].w = B; */
		gen3_fs_dp3(FS_U0, MASK_W,
			    gen3_fs_operand(FS_U0, X, Y, ONE, ZERO),
			    gen3_fs_operand(c+1, X, Y, Z, ZERO));
		/* u1.x = pdx² + pdy² - r1²; [C] */
		gen3_fs_dp3(FS_U1, MASK_X,
			    gen3_fs_operand(FS_U0, X, Y, Z, ZERO),
			    gen3_fs_operand(FS_U0, X, Y, NEG_Z, ZERO));
		/* u1.x = C, u1.y = B, u1.z=-4*A; */
		gen3_fs_mov_masked(FS_U1, MASK_Y, gen3_fs_operand(FS_U0, W, W, W, W));
		gen3_fs_mov_masked(FS_U1, MASK_Z, gen3_fs_operand(c, W, W, W, W));
		/* u1.x = B² - 4*A*C */
		gen3_fs_dp2add(FS_U1, MASK_X,
			       gen3_fs_operand(FS_U1, X, Y, ZERO, ZERO),
			       gen3_fs_operand(FS_U1, Z, Y, ZERO, ZERO),
			       gen3_fs_operand_zero());
		/* out.x = -B + sqrt (B² - 4*A*C), */
		gen3_fs_rsq(out, MASK_X, gen3_fs_operand(FS_U1, X, X, X, X));
		gen3_fs_mad(out, MASK_X,
			    gen3_fs_operand(out, X, ZERO, ZERO, ZERO),
			    gen3_fs_operand(FS_U1, X, ZERO, ZERO, ZERO),
			    gen3_fs_operand(FS_U0, NEG_W, ZERO, ZERO, ZERO));
		/* out.x = (-B + sqrt (B² - 4*A*C)) / (2 * A), */
		gen3_fs_mul(out,
			    gen3_fs_operand(out, X, ZERO, ZERO, ZERO),
			    gen3_fs_operand(c+1, W, ZERO, ZERO, ZERO));
		break;
	}
}
1556
/* Assemble and emit the gen3 pixel-shader program implementing the
 * current composite operation (source IN mask OP dst).  The program is
 * keyed by an 'id' derived from the channel shader types and fixups so
 * that a program identical to the last one emitted is skipped entirely.
 * The shader header dword is reserved up front and patched with the
 * final instruction length at the end.
 */
static void
gen3_composite_emit_shader(struct sna *sna,
			   const struct sna_composite_op *op,
			   uint8_t blend)
{
	bool dst_is_alpha = PIXMAN_FORMAT_RGB(op->dst.format) == 0;
	const struct sna_composite_channel *src, *mask;
	struct gen3_render_state *state = &sna->render_state.gen3;
	uint32_t shader_offset, id;
	int src_reg, mask_reg;
	int t, length;

	src = &op->src;
	mask = &op->mask;
	if (mask->u.gen3.type == SHADER_NONE)
		mask = NULL;

	/* Pack every input that affects the generated program into a
	 * single key for the duplicate-program check below. */
	id = (src->u.gen3.type |
	      src->is_affine << 4 |
	      src->alpha_fixup << 5 |
	      src->rb_reversed << 6);
	if (mask) {
		id |= (mask->u.gen3.type << 8 |
		       mask->is_affine << 12 |
		       gen3_blend_op[blend].src_alpha << 13 |
		       op->has_component_alpha << 14 |
		       mask->alpha_fixup << 15 |
		       mask->rb_reversed << 16);
	}
	id |= dst_is_alpha << 24;
	id |= op->rb_reversed << 25;

	if (id == state->last_shader)
		return;

	state->last_shader = id;

	/* Reserve the header dword; patched with the length at 'done'. */
	shader_offset = sna->kgem.nbatch++;
	t = 0;
	/* Declare the sampler/texcoord/diffuse inputs consumed by the
	 * source channel. */
	switch (src->u.gen3.type) {
	case SHADER_NONE:
	case SHADER_OPACITY:
		assert(0);
		/* fall through */
	case SHADER_ZERO:
	case SHADER_BLACK:
	case SHADER_WHITE:
		break;
	case SHADER_CONSTANT:
		gen3_fs_dcl(FS_T8);
		src_reg = FS_T8;
		break;
	case SHADER_TEXTURE:
	case SHADER_RADIAL:
	case SHADER_LINEAR:
		gen3_fs_dcl(FS_S0);
		gen3_fs_dcl(FS_T0);
		t++;
		break;
	}

	if (mask == NULL) {
		/* Unmasked path: route the source straight to the output
		 * color, with trivial sources short-circuited. */
		switch (src->u.gen3.type) {
		case SHADER_ZERO:
			gen3_fs_mov(FS_OC, gen3_fs_operand_zero());
			goto done;
		case SHADER_BLACK:
			if (dst_is_alpha)
				gen3_fs_mov(FS_OC, gen3_fs_operand_one());
			else
				gen3_fs_mov(FS_OC, gen3_fs_operand(FS_R0, ZERO, ZERO, ZERO, ONE));
			goto done;
		case SHADER_WHITE:
			gen3_fs_mov(FS_OC, gen3_fs_operand_one());
			goto done;
		}
		if (src->alpha_fixup && dst_is_alpha) {
			gen3_fs_mov(FS_OC, gen3_fs_operand_one());
			goto done;
		}
		/* No mask, so load directly to output color */
		if (src->u.gen3.type != SHADER_CONSTANT) {
			if (dst_is_alpha || src->rb_reversed ^ op->rb_reversed)
				src_reg = FS_R0;
			else
				src_reg = FS_OC;
		}
		switch (src->u.gen3.type) {
		case SHADER_LINEAR:
			gen3_linear_coord(sna, src, FS_T0, FS_R0);
			gen3_fs_texld(src_reg, FS_S0, FS_R0);
			break;

		case SHADER_RADIAL:
			gen3_radial_coord(sna, src, FS_T0, FS_R0);
			gen3_fs_texld(src_reg, FS_S0, FS_R0);
			break;

		case SHADER_TEXTURE:
			if (src->is_affine)
				gen3_fs_texld(src_reg, FS_S0, FS_T0);
			else
				gen3_fs_texldp(src_reg, FS_S0, FS_T0);
			break;

		case SHADER_NONE:
		case SHADER_WHITE:
		case SHADER_BLACK:
		case SHADER_ZERO:
			assert(0);
			/* fall through */
		case SHADER_CONSTANT:
			break;
		}

		/* Apply alpha-fixup / channel-swizzle while moving the
		 * sampled value to the output register if needed. */
		if (src_reg != FS_OC) {
			if (src->alpha_fixup)
				gen3_fs_mov(FS_OC,
					    src->rb_reversed ^ op->rb_reversed ?
					    gen3_fs_operand(src_reg, Z, Y, X, ONE) :
					    gen3_fs_operand(src_reg, X, Y, Z, ONE));
			else if (dst_is_alpha)
				gen3_fs_mov(FS_OC, gen3_fs_operand(src_reg, W, W, W, W));
			else if (src->rb_reversed ^ op->rb_reversed)
				gen3_fs_mov(FS_OC, gen3_fs_operand(src_reg, Z, Y, X, W));
			else
				gen3_fs_mov(FS_OC, gen3_fs_operand_reg(src_reg));
		} else if (src->alpha_fixup)
			gen3_fs_mov_masked(FS_OC, MASK_W, gen3_fs_operand_one());
	} else {
		/* Masked path: sample source and mask, combine, and
		 * optionally swizzle into the output register.  When the
		 * result needs a red/blue swap we combine into FS_U0
		 * first and swizzle at 'mask_done'. */
		int out_reg = FS_OC;
		if (op->rb_reversed)
			out_reg = FS_U0;

		/* Declare the mask channel's inputs. */
		switch (mask->u.gen3.type) {
		case SHADER_CONSTANT:
			gen3_fs_dcl(FS_T9);
			mask_reg = FS_T9;
			break;
		case SHADER_TEXTURE:
		case SHADER_LINEAR:
		case SHADER_RADIAL:
			gen3_fs_dcl(FS_S0 + t);
			/* fall through */
		case SHADER_OPACITY:
			gen3_fs_dcl(FS_T0 + t);
			break;
		case SHADER_ZERO:
		case SHADER_BLACK:
			assert(0);
			/* fall through */
		case SHADER_NONE:
		case SHADER_WHITE:
			break;
		}

		/* Sample the source into FS_R0; 't' counts the sampler
		 * and texcoord slots consumed so the mask uses the next
		 * free ones. */
		t = 0;
		switch (src->u.gen3.type) {
		case SHADER_LINEAR:
			gen3_linear_coord(sna, src, FS_T0, FS_R0);
			gen3_fs_texld(FS_R0, FS_S0, FS_R0);
			src_reg = FS_R0;
			t++;
			break;

		case SHADER_RADIAL:
			gen3_radial_coord(sna, src, FS_T0, FS_R0);
			gen3_fs_texld(FS_R0, FS_S0, FS_R0);
			src_reg = FS_R0;
			t++;
			break;

		case SHADER_TEXTURE:
			if (src->is_affine)
				gen3_fs_texld(FS_R0, FS_S0, FS_T0);
			else
				gen3_fs_texldp(FS_R0, FS_S0, FS_T0);
			src_reg = FS_R0;
			t++;
			break;

		case SHADER_CONSTANT:
		case SHADER_NONE:
		case SHADER_ZERO:
		case SHADER_BLACK:
		case SHADER_WHITE:
			break;
		}
		if (src->alpha_fixup)
			gen3_fs_mov_masked(src_reg, MASK_W, gen3_fs_operand_one());
		if (src->rb_reversed)
			gen3_fs_mov(src_reg, gen3_fs_operand(src_reg, Z, Y, X, W));

		/* Sample the mask into FS_R1. */
		switch (mask->u.gen3.type) {
		case SHADER_LINEAR:
			gen3_linear_coord(sna, mask, FS_T0 + t, FS_R1);
			gen3_fs_texld(FS_R1, FS_S0 + t, FS_R1);
			mask_reg = FS_R1;
			break;

		case SHADER_RADIAL:
			gen3_radial_coord(sna, mask, FS_T0 + t, FS_R1);
			gen3_fs_texld(FS_R1, FS_S0 + t, FS_R1);
			mask_reg = FS_R1;
			break;

		case SHADER_TEXTURE:
			if (mask->is_affine)
				gen3_fs_texld(FS_R1, FS_S0 + t, FS_T0 + t);
			else
				gen3_fs_texldp(FS_R1, FS_S0 + t, FS_T0 + t);
			mask_reg = FS_R1;
			break;

		case SHADER_OPACITY:
			/* Opacity mask: a scalar in the texcoord's X
			 * component; multiply (or substitute) directly
			 * and skip the generic combine below. */
			switch (src->u.gen3.type) {
			case SHADER_BLACK:
			case SHADER_WHITE:
				if (dst_is_alpha || src->u.gen3.type == SHADER_WHITE) {
					gen3_fs_mov(out_reg,
						    gen3_fs_operand(FS_T0 + t, X, X, X, X));
				} else {
					gen3_fs_mov(out_reg,
						    gen3_fs_operand(FS_T0 + t, ZERO, ZERO, ZERO, X));
				}
				break;
			default:
				if (dst_is_alpha) {
					gen3_fs_mul(out_reg,
						    gen3_fs_operand(src_reg, W, W, W, W),
						    gen3_fs_operand(FS_T0 + t, X, X, X, X));
				} else {
					gen3_fs_mul(out_reg,
						    gen3_fs_operand(src_reg, X, Y, Z, W),
						    gen3_fs_operand(FS_T0 + t, X, X, X, X));
				}
			}
			goto mask_done;

		case SHADER_CONSTANT:
		case SHADER_ZERO:
		case SHADER_BLACK:
		case SHADER_WHITE:
		case SHADER_NONE:
			break;
		}
		if (mask->alpha_fixup)
			gen3_fs_mov_masked(mask_reg, MASK_W, gen3_fs_operand_one());
		if (mask->rb_reversed)
			gen3_fs_mov(mask_reg, gen3_fs_operand(mask_reg, Z, Y, X, W));

		if (dst_is_alpha) {
			/* Alpha-only destination: only src.A * mask.A
			 * matters. */
			switch (src->u.gen3.type) {
			case SHADER_BLACK:
			case SHADER_WHITE:
				gen3_fs_mov(out_reg,
					    gen3_fs_operand(mask_reg, W, W, W, W));
				break;
			default:
				gen3_fs_mul(out_reg,
					    gen3_fs_operand(src_reg, W, W, W, W),
					    gen3_fs_operand(mask_reg, W, W, W, W));
				break;
			}
		} else {
			/* If component alpha is active in the mask and the blend
			 * operation uses the source alpha, then we know we don't
			 * need the source value (otherwise we would have hit a
			 * fallback earlier), so we provide the source alpha (src.A *
			 * mask.X) as output color.
			 * Conversely, if CA is set and we don't need the source alpha,
			 * then we produce the source value (src.X * mask.X) and the
			 * source alpha is unused.  Otherwise, we provide the non-CA
			 * source value (src.X * mask.A).
			 */
			if (op->has_component_alpha) {
				switch (src->u.gen3.type) {
				case SHADER_BLACK:
					if (gen3_blend_op[blend].src_alpha)
						gen3_fs_mov(out_reg,
							    gen3_fs_operand_reg(mask_reg));
					else
						gen3_fs_mov(out_reg,
							    gen3_fs_operand(mask_reg, ZERO, ZERO, ZERO, W));
					break;
				case SHADER_WHITE:
					gen3_fs_mov(out_reg,
						    gen3_fs_operand_reg(mask_reg));
					break;
				default:
					if (gen3_blend_op[blend].src_alpha)
						gen3_fs_mul(out_reg,
							    gen3_fs_operand(src_reg, W, W, W, W),
							    gen3_fs_operand_reg(mask_reg));
					else
						gen3_fs_mul(out_reg,
							    gen3_fs_operand_reg(src_reg),
							    gen3_fs_operand_reg(mask_reg));
					break;
				}
			} else {
				switch (src->u.gen3.type) {
				case SHADER_WHITE:
					gen3_fs_mov(out_reg,
						    gen3_fs_operand(mask_reg, W, W, W, W));
					break;
				case SHADER_BLACK:
					gen3_fs_mov(out_reg,
						    gen3_fs_operand(mask_reg, ZERO, ZERO, ZERO, W));
					break;
				default:
					gen3_fs_mul(out_reg,
						    gen3_fs_operand_reg(src_reg),
						    gen3_fs_operand(mask_reg, W, W, W, W));
					break;
				}
			}
		}
mask_done:
		/* Final red/blue swap if the combine went via FS_U0. */
		if (op->rb_reversed)
			gen3_fs_mov(FS_OC, gen3_fs_operand(FS_U0, Z, Y, X, W));
	}

done:
	/* Patch the reserved header with the emitted program length. */
	length = sna->kgem.nbatch - shader_offset;
	sna->kgem.batch[shader_offset] =
		_3DSTATE_PIXEL_SHADER_PROGRAM | (length - 2);
}
1882
1883static uint32_t gen3_ms_tiling(uint32_t tiling)
1884{
1885	uint32_t v = 0;
1886	switch (tiling) {
1887	case I915_TILING_Y: v |= MS3_TILE_WALK;
1888	case I915_TILING_X: v |= MS3_TILED_SURFACE;
1889	case I915_TILING_NONE: break;
1890	}
1891	return v;
1892}
1893
/* Emit the static 3D pipeline state assumed by the rest of the gen3
 * render paths: independent alpha blend off, 1:1 texcoord bindings,
 * default raster state, scissor/depth disabled, and no load-indirect
 * or stipple.  Emitted once per batch, gated by need_invariant.
 */
static void gen3_emit_invariant(struct sna *sna)
{
	/* Disable independent alpha blend */
	OUT_BATCH(_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD | IAB_MODIFY_ENABLE |
		  IAB_MODIFY_FUNC | BLENDFUNC_ADD << IAB_FUNC_SHIFT |
		  IAB_MODIFY_SRC_FACTOR | BLENDFACT_ONE << IAB_SRC_FACTOR_SHIFT |
		  IAB_MODIFY_DST_FACTOR | BLENDFACT_ZERO << IAB_DST_FACTOR_SHIFT);

	/* Bind texture coordinate set N to texture unit N for all 8 units. */
	OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
		  CSB_TCB(0, 0) |
		  CSB_TCB(1, 1) |
		  CSB_TCB(2, 2) |
		  CSB_TCB(3, 3) |
		  CSB_TCB(4, 4) |
		  CSB_TCB(5, 5) |
		  CSB_TCB(6, 6) |
		  CSB_TCB(7, 7));

	/* Load S3-S6: default point/line raster state, no culling,
	 * XY-only vertex format, color-writes only. */
	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | I1_LOAD_S(6) | 3);
	OUT_BATCH(0); /* Disable texture coordinate wrap-shortest */
	OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) |
		  S4_LINE_WIDTH_ONE |
		  S4_CULLMODE_NONE |
		  S4_VFMT_XY);
	OUT_BATCH(0); /* Disable fog/stencil. *Enable* write mask. */
	OUT_BATCH(S6_COLOR_WRITE_ONLY); /* Disable blending, depth */

	OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT);
	OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);

	OUT_BATCH(_3DSTATE_LOAD_INDIRECT);
	OUT_BATCH(0x00000000);

	OUT_BATCH(_3DSTATE_STIPPLE);
	OUT_BATCH(0x00000000);

	sna->render_state.gen3.need_invariant = false;
}
1932
1933#define MAX_OBJECTS 3 /* worst case: dst + src + mask  */
1934
/* Prepare the batch for a composite operation: switch to the render
 * ring, flush the current batch if there is not enough room left for
 * the command stream (200 dwords), relocations or exec objects (dst +
 * src + mask), and re-emit the invariant state after a flush.
 */
static void
gen3_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);

	/* 200 dwords is an upper bound on the state + vertices emitted
	 * for one composite setup. */
	if (!kgem_check_batch(&sna->kgem, 200)) {
		DBG(("%s: flushing batch: size %d > %d\n",
		     __FUNCTION__, 200,
		     sna->kgem.surface-sna->kgem.nbatch));
		kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	if (!kgem_check_reloc(&sna->kgem, MAX_OBJECTS)) {
		DBG(("%s: flushing batch: reloc %d >= %d\n",
		     __FUNCTION__,
		     sna->kgem.nreloc,
		     (int)KGEM_RELOC_SIZE(&sna->kgem) - MAX_OBJECTS));
		kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	if (!kgem_check_exec(&sna->kgem, MAX_OBJECTS)) {
		DBG(("%s: flushing batch: exec %d >= %d\n",
		     __FUNCTION__,
		     sna->kgem.nexec,
		     (int)KGEM_EXEC_SIZE(&sna->kgem) - MAX_OBJECTS - 1));
		kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	/* A fresh batch needs the invariant pipeline state re-emitted. */
	if (sna->render_state.gen3.need_invariant)
		gen3_emit_invariant(sna);
#undef MAX_OBJECTS
}
1970
/* Emit the render-target state (buffer info, destination format and
 * drawing rectangle) for 'bo', skipping the emission when the target
 * is unchanged since BUF_INFO implies a pipeline flush.  Marks the
 * target dirty for subsequent cache tracking.
 */
static void gen3_emit_target(struct sna *sna,
			     struct kgem_bo *bo,
			     int width,
			     int height,
			     int format)
{
	struct gen3_render_state *state = &sna->render_state.gen3;

	assert(!too_large(width, height));

	/* BUF_INFO is an implicit flush, so skip if the target is unchanged. */
	assert(bo->unique_id != 0);
	if (bo->unique_id != state->current_dst) {
		uint32_t v;

		DBG(("%s: setting new target id=%d, handle=%d\n",
		     __FUNCTION__, bo->unique_id, bo->handle));

		/* Color buffer address, tiling and pitch. */
		OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
		OUT_BATCH(BUF_3D_ID_COLOR_BACK |
			  gen3_buf_tiling(bo->tiling) |
			  bo->pitch);
		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 bo,
					 I915_GEM_DOMAIN_RENDER << 16 |
					 I915_GEM_DOMAIN_RENDER,
					 0));

		OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
		OUT_BATCH(gen3_get_dst_format(format));

		/* Clip the drawing rectangle to the target size; only
		 * re-emit when the limit actually changes. */
		v = DRAW_YMAX(height - 1) | DRAW_XMAX(width - 1);
		if (v != state->last_drawrect_limit) {
			OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
			OUT_BATCH(0); /* XXX dither origin? */
			OUT_BATCH(0);
			OUT_BATCH(v);
			OUT_BATCH(0);
			state->last_drawrect_limit = v;
		}

		state->current_dst = bo->unique_id;
	}
	assert(bo->exec);
	kgem_bo_mark_dirty(bo);
}
2017
2018static void gen3_emit_composite_state(struct sna *sna,
2019				      const struct sna_composite_op *op)
2020{
2021	struct gen3_render_state *state = &sna->render_state.gen3;
2022	uint32_t map[4];
2023	uint32_t sampler[4];
2024	struct kgem_bo *bo[2];
2025	unsigned int tex_count, n;
2026	uint32_t ss2;
2027
2028	gen3_get_batch(sna, op);
2029
2030	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
2031		if (op->src.bo == op->dst.bo || op->mask.bo == op->dst.bo)
2032			OUT_BATCH(MI_FLUSH | MI_INVALIDATE_MAP_CACHE);
2033		else
2034			OUT_BATCH(_3DSTATE_MODES_5_CMD |
2035				  PIPELINE_FLUSH_RENDER_CACHE |
2036				  PIPELINE_FLUSH_TEXTURE_CACHE);
2037		kgem_clear_dirty(&sna->kgem);
2038	}
2039
2040	gen3_emit_target(sna,
2041			 op->dst.bo,
2042			 op->dst.width,
2043			 op->dst.height,
2044			 op->dst.format);
2045
2046	ss2 = ~0;
2047	tex_count = 0;
2048	switch (op->src.u.gen3.type) {
2049	case SHADER_OPACITY:
2050	case SHADER_NONE:
2051		assert(0);
2052	case SHADER_ZERO:
2053	case SHADER_BLACK:
2054	case SHADER_WHITE:
2055		break;
2056	case SHADER_CONSTANT:
2057		if (op->src.u.gen3.mode != state->last_diffuse) {
2058			OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
2059			OUT_BATCH(op->src.u.gen3.mode);
2060			state->last_diffuse = op->src.u.gen3.mode;
2061		}
2062		break;
2063	case SHADER_LINEAR:
2064	case SHADER_RADIAL:
2065	case SHADER_TEXTURE:
2066		ss2 &= ~S2_TEXCOORD_FMT(tex_count, TEXCOORDFMT_NOT_PRESENT);
2067		ss2 |= S2_TEXCOORD_FMT(tex_count,
2068				       op->src.is_affine ? TEXCOORDFMT_2D : TEXCOORDFMT_4D);
2069		assert(op->src.card_format);
2070		map[tex_count * 2 + 0] =
2071			op->src.card_format |
2072			gen3_ms_tiling(op->src.bo->tiling) |
2073			(op->src.height - 1) << MS3_HEIGHT_SHIFT |
2074			(op->src.width - 1) << MS3_WIDTH_SHIFT;
2075		map[tex_count * 2 + 1] =
2076			(op->src.bo->pitch / 4 - 1) << MS4_PITCH_SHIFT;
2077
2078		sampler[tex_count * 2 + 0] = op->src.filter;
2079		sampler[tex_count * 2 + 1] =
2080			op->src.repeat |
2081			tex_count << SS3_TEXTUREMAP_INDEX_SHIFT;
2082		bo[tex_count] = op->src.bo;
2083		tex_count++;
2084		break;
2085	}
2086	switch (op->mask.u.gen3.type) {
2087	case SHADER_NONE:
2088	case SHADER_ZERO:
2089	case SHADER_BLACK:
2090	case SHADER_WHITE:
2091		break;
2092	case SHADER_CONSTANT:
2093		if (op->mask.u.gen3.mode != state->last_specular) {
2094			OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
2095			OUT_BATCH(op->mask.u.gen3.mode);
2096			state->last_specular = op->mask.u.gen3.mode;
2097		}
2098		break;
2099	case SHADER_LINEAR:
2100	case SHADER_RADIAL:
2101	case SHADER_TEXTURE:
2102		ss2 &= ~S2_TEXCOORD_FMT(tex_count, TEXCOORDFMT_NOT_PRESENT);
2103		ss2 |= S2_TEXCOORD_FMT(tex_count,
2104				       op->mask.is_affine ? TEXCOORDFMT_2D : TEXCOORDFMT_4D);
2105		assert(op->mask.card_format);
2106		map[tex_count * 2 + 0] =
2107			op->mask.card_format |
2108			gen3_ms_tiling(op->mask.bo->tiling) |
2109			(op->mask.height - 1) << MS3_HEIGHT_SHIFT |
2110			(op->mask.width - 1) << MS3_WIDTH_SHIFT;
2111		map[tex_count * 2 + 1] =
2112			(op->mask.bo->pitch / 4 - 1) << MS4_PITCH_SHIFT;
2113
2114		sampler[tex_count * 2 + 0] = op->mask.filter;
2115		sampler[tex_count * 2 + 1] =
2116			op->mask.repeat |
2117			tex_count << SS3_TEXTUREMAP_INDEX_SHIFT;
2118		bo[tex_count] = op->mask.bo;
2119		tex_count++;
2120		break;
2121	case SHADER_OPACITY:
2122		ss2 &= ~S2_TEXCOORD_FMT(tex_count, TEXCOORDFMT_NOT_PRESENT);
2123		ss2 |= S2_TEXCOORD_FMT(tex_count, TEXCOORDFMT_1D);
2124		break;
2125	}
2126
2127	{
2128		uint32_t blend_offset = sna->kgem.nbatch;
2129
2130		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
2131		OUT_BATCH(ss2);
2132		OUT_BATCH(gen3_get_blend_cntl(op->op,
2133					      op->has_component_alpha,
2134					      op->dst.format));
2135
2136		if (memcmp(sna->kgem.batch + state->last_blend + 1,
2137			   sna->kgem.batch + blend_offset + 1,
2138			   2 * 4) == 0)
2139			sna->kgem.nbatch = blend_offset;
2140		else
2141			state->last_blend = blend_offset;
2142	}
2143
2144	if (op->u.gen3.num_constants) {
2145		int count = op->u.gen3.num_constants;
2146		if (state->last_constants) {
2147			int last = sna->kgem.batch[state->last_constants+1];
2148			if (last == (1 << (count >> 2)) - 1 &&
2149			    memcmp(&sna->kgem.batch[state->last_constants+2],
2150				   op->u.gen3.constants,
2151				   count * sizeof(uint32_t)) == 0)
2152				count = 0;
2153		}
2154		if (count) {
2155			state->last_constants = sna->kgem.nbatch;
2156			OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | count);
2157			OUT_BATCH((1 << (count >> 2)) - 1);
2158
2159			memcpy(sna->kgem.batch + sna->kgem.nbatch,
2160			       op->u.gen3.constants,
2161			       count * sizeof(uint32_t));
2162			sna->kgem.nbatch += count;
2163		}
2164	}
2165
2166	if (tex_count != 0) {
2167		uint32_t rewind;
2168
2169		n = 0;
2170		if (tex_count == state->tex_count) {
2171			for (; n < tex_count; n++) {
2172				if (map[2*n+0] != state->tex_map[2*n+0] ||
2173				    map[2*n+1] != state->tex_map[2*n+1] ||
2174				    state->tex_handle[n] != bo[n]->handle ||
2175				    state->tex_delta[n] != bo[n]->delta)
2176					break;
2177			}
2178		}
2179		if (n < tex_count) {
2180			OUT_BATCH(_3DSTATE_MAP_STATE | (3 * tex_count));
2181			OUT_BATCH((1 << tex_count) - 1);
2182			for (n = 0; n < tex_count; n++) {
2183				OUT_BATCH(kgem_add_reloc(&sna->kgem,
2184							 sna->kgem.nbatch,
2185							 bo[n],
2186							 I915_GEM_DOMAIN_SAMPLER<< 16,
2187							 0));
2188				OUT_BATCH(map[2*n + 0]);
2189				OUT_BATCH(map[2*n + 1]);
2190
2191				state->tex_map[2*n+0] = map[2*n+0];
2192				state->tex_map[2*n+1] = map[2*n+1];
2193				state->tex_handle[n] = bo[n]->handle;
2194				state->tex_delta[n] = bo[n]->delta;
2195			}
2196			state->tex_count = n;
2197		}
2198
2199		rewind = sna->kgem.nbatch;
2200		OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * tex_count));
2201		OUT_BATCH((1 << tex_count) - 1);
2202		for (n = 0; n < tex_count; n++) {
2203			OUT_BATCH(sampler[2*n + 0]);
2204			OUT_BATCH(sampler[2*n + 1]);
2205			OUT_BATCH(0);
2206		}
2207		if (state->last_sampler &&
2208		    memcmp(&sna->kgem.batch[state->last_sampler+1],
2209			   &sna->kgem.batch[rewind + 1],
2210			   (3*tex_count + 1)*sizeof(uint32_t)) == 0)
2211			sna->kgem.nbatch = rewind;
2212		else
2213			state->last_sampler = rewind;
2214	}
2215
2216	gen3_composite_emit_shader(sna, op, op->op);
2217}
2218
/* Second pass for component-alpha rendering: replay the rectangles just
 * emitted with a PictOpAdd blend and a shader reload, so the destination
 * accumulates the per-channel mask contribution on top of the first pass.
 * Returns true if the extra pass was emitted.
 */
static bool gen3_magic_ca_pass(struct sna *sna,
			       const struct sna_composite_op *op)
{
	if (!op->need_magic_ca_pass)
		return false;

	DBG(("%s(%d)\n", __FUNCTION__,
	     sna->render.vertex_index - sna->render.vertex_start));

	/* Switch the blend unit to ADD (with component-alpha) and re-emit
	 * the pixel shader for that op. */
	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(6) | 0);
	OUT_BATCH(gen3_get_blend_cntl(PictOpAdd, true, op->dst.format));
	gen3_composite_emit_shader(sna, op, PictOpAdd);

	/* Re-issue exactly the same vertex range a second time. */
	OUT_BATCH(PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL |
		  (sna->render.vertex_index - sna->render.vertex_start));
	OUT_BATCH(sna->render.vertex_start);

	/* We clobbered the cached blend state; force it to be re-emitted. */
	sna->render_state.gen3.last_blend = 0;
	return true;
}
2239
/* Close the currently open primitive: back-patch the two placeholder
 * dwords reserved by gen3_rectangle_begin() with the real PRIM3D command
 * (vertex count and start index), then mark no primitive as open.
 */
static void gen3_vertex_flush(struct sna *sna)
{
	assert(sna->render.vertex_offset);

	DBG(("%s[%x] = %d\n", __FUNCTION__,
	     4*sna->render.vertex_offset,
	     sna->render.vertex_index - sna->render.vertex_start));

	sna->kgem.batch[sna->render.vertex_offset] =
		PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL |
		(sna->render.vertex_index - sna->render.vertex_start);
	sna->kgem.batch[sna->render.vertex_offset + 1] =
		sna->render.vertex_start;

	sna->render.vertex_offset = 0;
}
2256
/* Retire the current vertex buffer and start a fresh one.
 *
 * If a vbo is active its batch relocation is patched and the buffer is
 * released; a new linear bo is then allocated and mapped, and any staged
 * vertices are copied across so the caller can keep appending.
 *
 * Returns the number of floats free in the new buffer, or 0 if allocation
 * or mapping failed and we fell back to the static vertex_data array.
 */
static int gen3_vertex_finish(struct sna *sna)
{
	struct kgem_bo *bo;
	unsigned hint, size;

	DBG(("%s: used=%d/%d, vbo active? %d\n",
	     __FUNCTION__, sna->render.vertex_used, sna->render.vertex_size,
	     sna->render.vbo ? sna->render.vbo->handle : 0));
	assert(sna->render.vertex_offset == 0);
	assert(sna->render.vertex_used);
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	sna_vertex_wait__locked(&sna->render);

	hint = CREATE_GTT_MAP;
	bo = sna->render.vbo;
	if (bo) {
		DBG(("%s: reloc = %d\n", __FUNCTION__,
		     sna->render.vertex_reloc[0]));

		/* Point the pending batch reference at the finished vbo. */
		if (sna->render.vertex_reloc[0]) {
			sna->kgem.batch[sna->render.vertex_reloc[0]] =
				kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
					       bo, I915_GEM_DOMAIN_VERTEX << 16 | KGEM_RELOC_FENCED, 0);

			sna->render.vertex_reloc[0] = 0;
		}
		sna->render.vertex_used = 0;
		sna->render.vertex_index = 0;
		sna->render.vbo = NULL;

		kgem_bo_destroy(&sna->kgem, bo);
		hint |= CREATE_CACHED | CREATE_NO_THROTTLE;
	}

	/* Allocate a replacement, halving the size under memory pressure
	 * down to the point where the static staging array is as good. */
	size = 256*1024;
	sna->render.vertices = NULL;
	sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
	while (sna->render.vbo == NULL && size > sizeof(sna->render.vertex_data)) {
		size /= 2;
		sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
	}
	if (sna->render.vbo == NULL)
		sna->render.vbo = kgem_create_linear(&sna->kgem,
						     256*1024, CREATE_GTT_MAP);
	if (sna->render.vbo &&
	    kgem_check_bo(&sna->kgem, sna->render.vbo, NULL))
		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
	if (sna->render.vertices == NULL) {
		/* No usable vbo: fall back to the embedded staging array. */
		if (sna->render.vbo) {
			kgem_bo_destroy(&sna->kgem, sna->render.vbo);
			sna->render.vbo = NULL;
		}
		sna->render.vertices = sna->render.vertex_data;
		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
		return 0;
	}
	assert(sna->render.vbo->snoop == false);

	/* Carry over vertices staged before the swap. */
	if (sna->render.vertex_used) {
		memcpy(sna->render.vertices,
		       sna->render.vertex_data,
		       sizeof(float)*sna->render.vertex_used);
	}

	/* Vertex counts are held in 16-bit fields; clamp accordingly. */
	size = __kgem_bo_size(sna->render.vbo)/4;
	if (size >= UINT16_MAX)
		size = UINT16_MAX - 1;
	assert(size > sna->render.vertex_used);

	sna->render.vertex_size = size;
	return size - sna->render.vertex_used;
}
2330
/* Finalise vertex data for batch submission.
 *
 * Resolves the outstanding S0 vertex-buffer relocation: either the active
 * vbo is reused (converting a CPU map to GTT if needed), the staged
 * vertices are copied inline into the batch, or a one-shot bo is created
 * for them. Nearly-full or unusable vbos are discarded afterwards.
 */
static void gen3_vertex_close(struct sna *sna)
{
	struct kgem_bo *bo, *free_bo = NULL;
	unsigned int delta = 0;

	assert(sna->render.vertex_offset == 0);
	/* Nothing references the vertices from this batch. */
	if (sna->render.vertex_reloc[0] == 0)
		return;

	DBG(("%s: used=%d/%d, vbo active? %d\n",
	     __FUNCTION__, sna->render.vertex_used, sna->render.vertex_size,
	     sna->render.vbo ? sna->render.vbo->handle : 0));

	bo = sna->render.vbo;
	if (bo) {
		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
			/* Not enough room left to be worth keeping around. */
			DBG(("%s: discarding full vbo\n", __FUNCTION__));
			sna->render.vbo = NULL;
			sna->render.vertices = sna->render.vertex_data;
			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
			free_bo = bo;
		} else if (sna->render.vertices == MAP(bo->map__cpu)) {
			/* The GPU is about to read it; switch from the CPU
			 * mapping to a coherent GTT mapping for future writes. */
			DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
			sna->render.vertices = kgem_bo_map__gtt(&sna->kgem, bo);
			if (sna->render.vertices == NULL) {
				DBG(("%s: discarding non-mappable vertices\n",__FUNCTION__));
				sna->render.vbo = NULL;
				sna->render.vertices = sna->render.vertex_data;
				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
				free_bo = bo;
			}
		}
	} else {
		if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
			/* Small enough to embed directly in the batch buffer;
			 * the reloc then targets the batch itself (bo == NULL)
			 * at byte offset delta. */
			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
			     sna->render.vertex_used, sna->kgem.nbatch));
			memcpy(sna->kgem.batch + sna->kgem.nbatch,
			       sna->render.vertex_data,
			       sna->render.vertex_used * 4);
			delta = sna->kgem.nbatch * 4;
			bo = NULL;
			sna->kgem.nbatch += sna->render.vertex_used;
		} else {
			/* One-shot upload bo, destroyed after the reloc. */
			DBG(("%s: new vbo: %d\n", __FUNCTION__,
			     sna->render.vertex_used));
			bo = kgem_create_linear(&sna->kgem,
						4*sna->render.vertex_used,
						CREATE_NO_THROTTLE);
			if (bo) {
				assert(bo->snoop == false);
				kgem_bo_write(&sna->kgem, bo,
					      sna->render.vertex_data,
					      4*sna->render.vertex_used);
			}
			free_bo = bo;
		}
	}

	DBG(("%s: reloc = %d\n", __FUNCTION__, sna->render.vertex_reloc[0]));
	sna->kgem.batch[sna->render.vertex_reloc[0]] =
		kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
			       bo, I915_GEM_DOMAIN_VERTEX << 16 | KGEM_RELOC_FENCED, delta);
	sna->render.vertex_reloc[0] = 0;

	if (sna->render.vbo == NULL) {
		DBG(("%s: resetting vbo\n", __FUNCTION__));
		sna->render.vertex_used = 0;
		sna->render.vertex_index = 0;
		assert(sna->render.vertices == sna->render.vertex_data);
		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
	}

	if (free_bo)
		kgem_bo_destroy(&sna->kgem, free_bo);
}
2406
/* Open a new RECTLIST primitive, emitting any vertex-format (S1) and
 * vertex-buffer (S0) state that has changed and reserving two batch
 * dwords that gen3_vertex_flush() later back-patches with the actual
 * primitive command. Returns false if the batch lacks room.
 */
static bool gen3_rectangle_begin(struct sna *sna,
				 const struct sna_composite_op *op)
{
	struct gen3_render_state *state = &sna->render_state.gen3;
	int ndwords, i1_cmd = 0, i1_len = 0;

	/* Another thread may have opened the primitive while we waited. */
	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
		return true;

	/* Worst-case batch-space estimate: 2 for the primitive placeholder,
	 * ~100 for a potential component-alpha second pass, plus any
	 * LOAD_STATE_IMMEDIATE_1 payload dwords. */
	ndwords = 2;
	if (op->need_magic_ca_pass)
		ndwords += 100;
	if (sna->render.vertex_reloc[0] == 0)
		i1_len++, i1_cmd |= I1_LOAD_S(0), ndwords++;
	if (state->floats_per_vertex != op->floats_per_vertex)
		i1_len++, i1_cmd |= I1_LOAD_S(1), ndwords++;

	if (!kgem_check_batch(&sna->kgem, ndwords+1))
		return false;

	if (i1_cmd) {
		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | i1_cmd | (i1_len - 1));
		if (sna->render.vertex_reloc[0] == 0)
			/* Remember where S0 goes; the vbo address is patched
			 * in later by gen3_vertex_finish/close. */
			sna->render.vertex_reloc[0] = sna->kgem.nbatch++;
		if (state->floats_per_vertex != op->floats_per_vertex) {
			state->floats_per_vertex = op->floats_per_vertex;
			OUT_BATCH(state->floats_per_vertex << S1_VERTEX_WIDTH_SHIFT |
				  state->floats_per_vertex << S1_VERTEX_PITCH_SHIFT);
		}
	}

	if (sna->kgem.nbatch == 2 + state->last_vertex_offset &&
	    !op->need_magic_ca_pass) {
		/* Nothing was emitted since the previous primitive closed:
		 * reopen it in place instead of starting a new one. */
		sna->render.vertex_offset = state->last_vertex_offset;
	} else {
		sna->render.vertex_offset = sna->kgem.nbatch;
		OUT_BATCH(MI_NOOP); /* to be filled later */
		OUT_BATCH(MI_NOOP);
		sna->render.vertex_start = sna->render.vertex_index;
		state->last_vertex_offset = sna->render.vertex_offset;
	}

	return true;
}
2451
/* Slow path for gen3_get_rectangles() when the vertex buffer is full:
 * close the open primitive, run the component-alpha pass if required,
 * and obtain a fresh vertex buffer. Returns the floats now available,
 * or 0 if the whole batch must be submitted first.
 */
static int gen3_get_rectangles__flush(struct sna *sna,
				      const struct sna_composite_op *op)
{
	/* Preventing discarding new vbo after lock contention */
	if (sna_vertex_wait__locked(&sna->render)) {
		int rem = vertex_space(sna);
		if (rem > op->floats_per_rect)
			return rem;
	}

	/* Room for the primitive (plus ~100 dwords for a CA pass) and one
	 * relocation is needed before we can continue in this batch. */
	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 105: 5))
		return 0;
	if (!kgem_check_reloc_and_exec(&sna->kgem, 1))
		return 0;

	if (sna->render.vertex_offset) {
		gen3_vertex_flush(sna);
		if (gen3_magic_ca_pass(sna, op)) {
			/* The CA pass switched blend/shader state; restore
			 * the operation's own state before continuing. */
			OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(6) | 0);
			OUT_BATCH(gen3_get_blend_cntl(op->op,
						      op->has_component_alpha,
						      op->dst.format));
			gen3_composite_emit_shader(sna, op, op->op);
		}
	}

	return gen3_vertex_finish(sna);
}
2480
/* Reserve vertex space for up to 'want' rectangles, flushing the vertex
 * buffer and/or submitting the batch as required. Always returns at
 * least 1; the caller must emit exactly the returned number of rects.
 */
inline static int gen3_get_rectangles(struct sna *sna,
				      const struct sna_composite_op *op,
				      int want)
{
	int rem;

	DBG(("%s: want=%d, rem=%d\n",
	     __FUNCTION__, want*op->floats_per_rect, vertex_space(sna)));

	assert(want);
	assert(sna->render.vertex_index * op->floats_per_vertex == sna->render.vertex_used);

start:
	rem = vertex_space(sna);
	if (unlikely(op->floats_per_rect > rem)) {
		DBG(("flushing vbo for %s: %d < %d\n",
		     __FUNCTION__, rem, op->floats_per_rect));
		rem = gen3_get_rectangles__flush(sna, op);
		if (unlikely(rem == 0))
			goto flush;
	}

	/* No primitive currently open: emit one, then retry the space check
	 * (beginning a primitive may consume batch space). */
	if (unlikely(sna->render.vertex_offset == 0)) {
		if (!gen3_rectangle_begin(sna, op))
			goto flush;
		else
			goto start;
	}

	assert(rem <= vertex_space(sna));
	assert(op->floats_per_rect <= rem);
	/* Trim the request to what actually fits (never below 1). */
	if (want > 1 && want * op->floats_per_rect > rem)
		want = rem / op->floats_per_rect;
	sna->render.vertex_index += 3*want;

	assert(want);
	assert(sna->render.vertex_index * op->floats_per_vertex <= sna->render.vertex_size);
	return want;

flush:
	/* Batch exhausted: close everything out, submit, re-emit the full
	 * composite state in the new batch, and start over. */
	DBG(("%s: flushing batch\n", __FUNCTION__));
	if (sna->render.vertex_offset) {
		gen3_vertex_flush(sna);
		gen3_magic_ca_pass(sna, op);
	}
	sna_vertex_wait__locked(&sna->render);
	_kgem_submit(&sna->kgem);
	gen3_emit_composite_state(sna, op);
	assert(sna->render.vertex_offset == 0);
	assert(sna->render.vertex_reloc[0] == 0);
	goto start;
}
2533
/* Composite a single rectangle: reserve vertex space for one rect and
 * hand off to the operation's specialised vertex emitter. */
fastcall static void
gen3_render_composite_blt(struct sna *sna,
			  const struct sna_composite_op *op,
			  const struct sna_composite_rectangles *r)
{
	DBG(("%s: src=(%d, %d)+(%d, %d), mask=(%d, %d)+(%d, %d), dst=(%d, %d)+(%d, %d), size=(%d, %d)\n", __FUNCTION__,
	     r->src.x, r->src.y, op->src.offset[0], op->src.offset[1],
	     r->mask.x, r->mask.y, op->mask.offset[0], op->mask.offset[1],
	     r->dst.x, r->dst.y, op->dst.x, op->dst.y,
	     r->width, r->height));

	gen3_get_rectangles(sna, op, 1);

	op->prim_emit(sna, op, r);
}
2549
2550fastcall static void
2551gen3_render_composite_box(struct sna *sna,
2552			  const struct sna_composite_op *op,
2553			  const BoxRec *box)
2554{
2555	struct sna_composite_rectangles r;
2556
2557	DBG(("%s: src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n",
2558	     __FUNCTION__,
2559	     op->src.offset[0], op->src.offset[1],
2560	     op->mask.offset[0], op->mask.offset[1],
2561	     op->dst.x, op->dst.y));
2562
2563	gen3_get_rectangles(sna, op, 1);
2564
2565	r.dst.x  = box->x1;
2566	r.dst.y  = box->y1;
2567	r.width  = box->x2 - box->x1;
2568	r.height = box->y2 - box->y1;
2569	r.src = r.mask = r.dst;
2570
2571	op->prim_emit(sna, op, &r);
2572}
2573
2574static void
2575gen3_render_composite_boxes__blt(struct sna *sna,
2576				 const struct sna_composite_op *op,
2577				 const BoxRec *box, int nbox)
2578{
2579	DBG(("%s: nbox=%d, src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n",
2580	     __FUNCTION__, nbox,
2581	     op->src.offset[0], op->src.offset[1],
2582	     op->mask.offset[0], op->mask.offset[1],
2583	     op->dst.x, op->dst.y));
2584
2585	do {
2586		int nbox_this_time;
2587
2588		nbox_this_time = gen3_get_rectangles(sna, op, nbox);
2589		nbox -= nbox_this_time;
2590
2591		do {
2592			struct sna_composite_rectangles r;
2593
2594			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2595			     box->x1, box->y1,
2596			     box->x2 - box->x1,
2597			     box->y2 - box->y1));
2598
2599			r.dst.x  = box->x1; r.dst.y  = box->y1;
2600			r.width = box->x2 - box->x1;
2601			r.height = box->y2 - box->y1;
2602			r.src = r.mask = r.dst;
2603
2604			op->prim_emit(sna, op, &r);
2605			box++;
2606		} while (--nbox_this_time);
2607	} while (nbox);
2608}
2609
2610static void
2611gen3_render_composite_boxes(struct sna *sna,
2612			    const struct sna_composite_op *op,
2613			    const BoxRec *box, int nbox)
2614{
2615	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
2616
2617	do {
2618		int nbox_this_time;
2619		float *v;
2620
2621		nbox_this_time = gen3_get_rectangles(sna, op, nbox);
2622		assert(nbox_this_time);
2623		nbox -= nbox_this_time;
2624
2625		v = sna->render.vertices + sna->render.vertex_used;
2626		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
2627		assert(sna->render.vertex_used <= sna->render.vertex_size);
2628
2629		op->emit_boxes(op, box, nbox_this_time, v);
2630		box += nbox_this_time;
2631	} while (nbox);
2632}
2633
/* Threaded variant of gen3_render_composite_boxes: the vertex lock is
 * held only while carving out space from the shared vertex buffer; the
 * acquire/release pair marks the carved range in-flight so the buffer
 * is not retired while this thread fills it outside the lock.
 */
static void
gen3_render_composite_boxes__thread(struct sna *sna,
				    const struct sna_composite_op *op,
				    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen3_get_rectangles(sna, op, nbox);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		/* Reserve the vertex range while still holding the lock. */
		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
		assert(sna->render.vertex_used <= sna->render.vertex_size);

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		/* Fill the reserved range without holding the lock. */
		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
2665
/* Finish a composite operation: close any open primitive (running the
 * component-alpha second pass if needed), drop the references taken on
 * the mask and source bo, and undo any destination redirection.
 */
static void
gen3_render_composite_done(struct sna *sna,
			   const struct sna_composite_op *op)
{
	DBG(("%s()\n", __FUNCTION__));

	if (sna->render.vertex_offset) {
		gen3_vertex_flush(sna);
		gen3_magic_ca_pass(sna, op);
	}

	if (op->mask.bo)
		kgem_bo_destroy(&sna->kgem, op->mask.bo);
	if (op->src.bo)
		kgem_bo_destroy(&sna->kgem, op->src.bo);

	sna_render_composite_redirect_done(sna, op);
}
2684
2685static void
2686discard_vbo(struct sna *sna)
2687{
2688	kgem_bo_destroy(&sna->kgem, sna->render.vbo);
2689	sna->render.vbo = NULL;
2690	sna->render.vertices = sna->render.vertex_data;
2691	sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
2692	sna->render.vertex_used = 0;
2693	sna->render.vertex_index = 0;
2694}
2695
/* Reset all cached gen3 hardware state after a batch submission, so
 * every piece of state is re-emitted on first use in the new batch.
 * Sentinel values (0x7fffffff, 0xcc00ffee) are chosen to never match a
 * real shader/colour and thus force re-emission.
 */
static void
gen3_render_reset(struct sna *sna)
{
	struct gen3_render_state *state = &sna->render_state.gen3;

	state->need_invariant = true;
	state->current_dst = 0;
	state->tex_count = 0;
	state->last_drawrect_limit = ~0U;
	state->last_target = 0;
	state->last_blend = 0;
	state->last_constants = 0;
	state->last_sampler = 0;
	state->last_shader = 0x7fffffff;
	state->last_diffuse = 0xcc00ffee;
	state->last_specular = 0xcc00ffee;

	state->floats_per_vertex = 0;
	state->last_floats_per_vertex = 0;
	state->last_vertex_offset = 0;

	/* If touching the vbo would now stall on the GPU, drop it rather
	 * than block the next render. */
	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
		DBG(("%s: discarding vbo as next access will stall: %lx\n",
		     __FUNCTION__, (long)sna->render.vbo->presumed_offset));
		discard_vbo(sna);
	}

	sna->render.vertex_reloc[0] = 0;
	sna->render.vertex_offset = 0;
}
2726
2727static void
2728gen3_render_retire(struct kgem *kgem)
2729{
2730	struct sna *sna;
2731
2732	sna = container_of(kgem, struct sna, kgem);
2733	if (sna->render.vertex_reloc[0] == 0 &&
2734	    sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) {
2735		DBG(("%s: resetting idle vbo\n", __FUNCTION__));
2736		sna->render.vertex_used = 0;
2737		sna->render.vertex_index = 0;
2738	}
2739}
2740
2741static void
2742gen3_render_expire(struct kgem *kgem)
2743{
2744	struct sna *sna;
2745
2746	sna = container_of(kgem, struct sna, kgem);
2747	if (sna->render.vbo && !sna->render.vertex_used) {
2748		DBG(("%s: discarding vbo\n", __FUNCTION__));
2749		discard_vbo(sna);
2750	}
2751}
2752
2753static bool gen3_composite_channel_set_format(struct sna_composite_channel *channel,
2754					      CARD32 format)
2755{
2756	unsigned int i;
2757
2758	for (i = 0; i < ARRAY_SIZE(gen3_tex_formats); i++) {
2759		if (gen3_tex_formats[i].fmt == format) {
2760			channel->card_format = gen3_tex_formats[i].card_fmt;
2761			channel->rb_reversed = gen3_tex_formats[i].rb_reversed;
2762			return true;
2763		}
2764	}
2765	return false;
2766}
2767
2768static bool source_is_covered(PicturePtr picture,
2769			      int x, int y,
2770			      int width, int height)
2771{
2772	int x1, y1, x2, y2;
2773
2774	if (picture->repeat && picture->repeatType != RepeatNone)
2775		return true;
2776
2777	if (picture->pDrawable == NULL)
2778		return false;
2779
2780	if (picture->transform) {
2781		pixman_box16_t sample;
2782
2783		sample.x1 = x;
2784		sample.y1 = y;
2785		sample.x2 = x + width;
2786		sample.y2 = y + height;
2787
2788		pixman_transform_bounds(picture->transform, &sample);
2789
2790		x1 = sample.x1;
2791		x2 = sample.x2;
2792		y1 = sample.y1;
2793		y2 = sample.y2;
2794	} else {
2795		x1 = x;
2796		y1 = y;
2797		x2 = x + width;
2798		y2 = y + height;
2799	}
2800
2801	return
2802		x1 >= 0 && y1 >= 0 &&
2803		x2 <= picture->pDrawable->width &&
2804		y2 <= picture->pDrawable->height;
2805}
2806
2807static bool gen3_composite_channel_set_xformat(PicturePtr picture,
2808					       struct sna_composite_channel *channel,
2809					       int x, int y,
2810					       int width, int height)
2811{
2812	unsigned int i;
2813
2814	if (PICT_FORMAT_A(picture->format) != 0)
2815		return false;
2816
2817	if (width == 0 || height == 0)
2818		return false;
2819
2820	if (!source_is_covered(picture, x, y, width, height))
2821		return false;
2822
2823	for (i = 0; i < ARRAY_SIZE(gen3_tex_formats); i++) {
2824		if (gen3_tex_formats[i].xfmt == picture->format) {
2825			channel->card_format = gen3_tex_formats[i].card_fmt;
2826			channel->rb_reversed = gen3_tex_formats[i].rb_reversed;
2827			channel->alpha_fixup = true;
2828			return true;
2829		}
2830	}
2831
2832	return false;
2833}
2834
2835static int
2836gen3_init_solid(struct sna_composite_channel *channel, uint32_t color)
2837{
2838	channel->u.gen3.mode = color;
2839	channel->u.gen3.type = SHADER_CONSTANT;
2840	if (color == 0)
2841		channel->u.gen3.type = SHADER_ZERO;
2842	else if (color == 0xff000000)
2843		channel->u.gen3.type = SHADER_BLACK;
2844	else if (color == 0xffffffff)
2845		channel->u.gen3.type = SHADER_WHITE;
2846
2847	channel->bo = NULL;
2848	channel->is_opaque = (color >> 24) == 0xff;
2849	channel->is_affine = 1;
2850	channel->alpha_fixup = 0;
2851	channel->rb_reversed = 0;
2852
2853	DBG(("%s: color=%08x, is_opaque=%d, type=%d\n",
2854	     __FUNCTION__, color, channel->is_opaque, channel->u.gen3.type));
2855
2856	/* for consistency */
2857	channel->repeat = RepeatNormal;
2858	channel->filter = PictFilterNearest;
2859	channel->pict_format = PICT_a8r8g8b8;
2860	channel->card_format = MAPSURF_32BIT | MT_32BIT_ARGB8888;
2861
2862	return 1;
2863}
2864
2865static void gen3_composite_channel_convert(struct sna_composite_channel *channel)
2866{
2867	if (channel->u.gen3.type == SHADER_TEXTURE)
2868		channel->repeat = gen3_texture_repeat(channel->repeat);
2869	else
2870		channel->repeat = gen3_gradient_repeat(channel->repeat);
2871
2872	channel->filter = gen3_filter(channel->filter);
2873	if (channel->card_format == 0)
2874		gen3_composite_channel_set_format(channel, channel->pict_format);
2875	assert(channel->card_format);
2876}
2877
/* Common channel setup for linear/radial gradients: bake the colour
 * ramp into a 1-pixel-high ARGB texture and configure the channel to
 * sample it. (ox, oy) is the gradient origin in destination space.
 * Returns false for unsupported repeat modes or on allocation failure.
 */
static bool gen3_gradient_setup(struct sna *sna,
				PicturePtr picture,
				struct sna_composite_channel *channel,
				int16_t ox, int16_t oy)
{
	int16_t dx, dy;

	if (picture->repeat == 0) {
		channel->repeat = RepeatNone;
	} else switch (picture->repeatType) {
	case RepeatNone:
	case RepeatNormal:
	case RepeatPad:
	case RepeatReflect:
		channel->repeat = picture->repeatType;
		break;
	default:
		return false;
	}

	/* Rasterise the colour stops into a cached 1D gradient texture. */
	channel->bo =
		sna_render_get_gradient(sna,
					(PictGradient *)picture->pSourcePict);
	if (channel->bo == NULL)
		return false;

	channel->pict_format = PICT_a8r8g8b8;
	channel->card_format = MAPSURF_32BIT | MT_32BIT_ARGB8888;
	channel->filter = PictFilterNearest;
	channel->is_affine = sna_transform_is_affine(picture->transform);
	/* A pure integer translation can be folded into the origin,
	 * letting us drop the per-pixel transform entirely. */
	if (sna_transform_is_imprecise_integer_translation(picture->transform, PictFilterNearest, false, &dx, &dy)) {
		DBG(("%s: integer translation (%d, %d), removing\n",
		     __FUNCTION__, dx, dy));
		ox += dx;
		oy += dy;
		channel->transform = NULL;
	} else
		channel->transform = picture->transform;
	channel->width  = channel->bo->pitch / 4;
	channel->height = 1;
	channel->offset[0] = ox;
	channel->offset[1] = oy;
	channel->scale[0] = channel->scale[1] = 1;
	return true;
}
2923
/* Initialise a channel for a linear gradient.
 *
 * Appends a shader constant (dx, dy, -offset, 0) such that the dot
 * product with a pixel position projects it onto the p1->p2 axis,
 * normalised to [0, 1] over the gradient extent.
 *
 * Returns 1 on success, 0 for a degenerate (zero-length) gradient and
 * -1 on failure to set up the gradient texture.
 */
static int
gen3_init_linear(struct sna *sna,
		 PicturePtr picture,
		 struct sna_composite_op *op,
		 struct sna_composite_channel *channel,
		 int ox, int oy)
{
	PictLinearGradient *linear =
		(PictLinearGradient *)picture->pSourcePict;
	float x0, y0, sf;
	float dx, dy, offset;
	int n;

	DBG(("%s: p1=(%f, %f), p2=(%f, %f)\n",
	     __FUNCTION__,
	     xFixedToDouble(linear->p1.x), xFixedToDouble(linear->p1.y),
	     xFixedToDouble(linear->p2.x), xFixedToDouble(linear->p2.y)));

	if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
		return 0;

	/* Scale the axis vector by 1/|p2-p1|^2 so the projection spans
	 * exactly one unit between the endpoints. */
	dx = xFixedToDouble(linear->p2.x - linear->p1.x);
	dy = xFixedToDouble(linear->p2.y - linear->p1.y);
	sf = dx*dx + dy*dy;
	dx /= sf;
	dy /= sf;

	x0 = xFixedToDouble(linear->p1.x);
	y0 = xFixedToDouble(linear->p1.y);
	offset = dx*x0 + dy*y0;

	n = op->u.gen3.num_constants;
	channel->u.gen3.constants = FS_C0 + n / 4;
	op->u.gen3.constants[n++] = dx;
	op->u.gen3.constants[n++] = dy;
	op->u.gen3.constants[n++] = -offset;
	op->u.gen3.constants[n++] = 0;

	if (!gen3_gradient_setup(sna, picture, channel, ox, oy))
		return -1;

	/* Commit the constants only after gradient setup succeeds. */
	channel->u.gen3.type = SHADER_LINEAR;
	op->u.gen3.num_constants = n;

	DBG(("%s: dx=%f, dy=%f, offset=%f, constants=%d\n",
	     __FUNCTION__, dx, dy, -offset, channel->u.gen3.constants - FS_C0));
	return 1;
}
2972
/* Initialise a channel for a radial gradient.
 *
 * Two shader variants: RADIAL_ONE for concentric circles (one constant
 * quad), RADIAL_TWO for the general two-circle case (two quads feeding
 * the quadratic solve). Returns 1 on success, -1 on failure; an
 * empty gradient (identical circles) degenerates to SHADER_ZERO.
 */
static int
gen3_init_radial(struct sna *sna,
		 PicturePtr picture,
		 struct sna_composite_op *op,
		 struct sna_composite_channel *channel,
		 int ox, int oy)
{
	PictRadialGradient *radial = (PictRadialGradient *)picture->pSourcePict;
	double dx, dy, dr, r1;
	int n;

	dx = xFixedToDouble(radial->c2.x - radial->c1.x);
	dy = xFixedToDouble(radial->c2.y - radial->c1.y);
	dr = xFixedToDouble(radial->c2.radius - radial->c1.radius);

	r1 = xFixedToDouble(radial->c1.radius);

	n = op->u.gen3.num_constants;
	channel->u.gen3.constants = FS_C0 + n / 4;
	if (radial->c2.x == radial->c1.x && radial->c2.y == radial->c1.y) {
		if (radial->c2.radius == radial->c1.radius) {
			/* Degenerate: nothing to draw.
			 * NOTE(review): channel->bo is left untouched here,
			 * unlike gen3_init_solid which clears it — presumably
			 * callers zero the channel first; confirm. */
			channel->u.gen3.type = SHADER_ZERO;
			return 1;
		}

		op->u.gen3.constants[n++] = xFixedToDouble(radial->c1.x) / dr;
		op->u.gen3.constants[n++] = xFixedToDouble(radial->c1.y) / dr;
		op->u.gen3.constants[n++] = 1. / dr;
		op->u.gen3.constants[n++] = -r1 / dr;

		channel->u.gen3.mode = RADIAL_ONE;
	} else {
		op->u.gen3.constants[n++] = -xFixedToDouble(radial->c1.x);
		op->u.gen3.constants[n++] = -xFixedToDouble(radial->c1.y);
		op->u.gen3.constants[n++] = r1;
		op->u.gen3.constants[n++] = -4 * (dx*dx + dy*dy - dr*dr);

		op->u.gen3.constants[n++] = -2 * dx;
		op->u.gen3.constants[n++] = -2 * dy;
		op->u.gen3.constants[n++] = -2 * r1 * dr;
		op->u.gen3.constants[n++] = 1 / (2 * (dx*dx + dy*dy - dr*dr));

		channel->u.gen3.mode = RADIAL_TWO;
	}

	if (!gen3_gradient_setup(sna, picture, channel, ox, oy))
		return -1;

	/* Commit the constants only after gradient setup succeeds. */
	channel->u.gen3.type = SHADER_RADIAL;
	op->u.gen3.num_constants = n;
	return 1;
}
3025
3026static bool
3027sna_picture_is_clear(PicturePtr picture,
3028		     int x, int y, int w, int h,
3029		     uint32_t *color)
3030{
3031	struct sna_pixmap *priv;
3032
3033	if (!picture->pDrawable)
3034		return false;
3035
3036	priv = sna_pixmap(get_drawable_pixmap(picture->pDrawable));
3037	if (priv == NULL || !priv->clear)
3038		return false;
3039
3040	if (!source_is_covered(picture, x, y, w, h))
3041		return false;
3042
3043	*color = priv->clear_color;
3044	return true;
3045}
3046
/* Prepare a source or mask picture as a gen3 composite channel.
 *
 * Dispatches on the picture type: source-only pictures (solid fills,
 * gradients), solid/clear drawables, and textured drawables — falling
 * back to software fixup/conversion/extraction helpers for anything the
 * hardware cannot sample directly. (x, y, w, h) is the region sampled,
 * (dst_x, dst_y) its destination placement.
 *
 * Returns the channel-preparation result following the gen3_init_* /
 * sna_render_* convention (1 on success; 0 / -1 otherwise).
 */
static int
gen3_composite_picture(struct sna *sna,
		       PicturePtr picture,
		       struct sna_composite_op *op,
		       struct sna_composite_channel *channel,
		       int16_t x, int16_t y,
		       int16_t w, int16_t h,
		       int16_t dst_x, int16_t dst_y,
		       bool precise)
{
	PixmapPtr pixmap;
	uint32_t color;
	int16_t dx, dy;

	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
	     __FUNCTION__, x, y, w, h, dst_x, dst_y));

	channel->card_format = 0;

	/* Drawable-less pictures are procedural sources. */
	if (picture->pDrawable == NULL) {
		SourcePict *source = picture->pSourcePict;
		int ret = -1;

		switch (source->type) {
		case SourcePictTypeSolidFill:
			DBG(("%s: solid fill [%08x], format %08x\n",
			     __FUNCTION__,
			     (unsigned)source->solidFill.color,
			     (unsigned)picture->format));
			ret = gen3_init_solid(channel, source->solidFill.color);
			break;

		case SourcePictTypeLinear:
			ret = gen3_init_linear(sna, picture, op, channel,
					       x - dst_x, y - dst_y);
			break;

		case SourcePictTypeRadial:
			ret = gen3_init_radial(sna, picture, op, channel,
					       x - dst_x, y - dst_y);
			break;
		}

		/* Unsupported gradient: rasterise it in software. */
		if (ret == -1) {
			if (!precise)
				ret = sna_render_picture_approximate_gradient(sna, picture, channel,
									      x, y, w, h, dst_x, dst_y);
			if (ret == -1)
				ret = sna_render_picture_fixup(sna, picture, channel,
							       x, y, w, h, dst_x, dst_y);
		}
		return ret;
	}

	if (picture->alphaMap) {
		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);
	}

	/* Reduce uniform drawables to solid-colour shaders. */
	if (sna_picture_is_solid(picture, &color)) {
		DBG(("%s: solid drawable [%08x]\n", __FUNCTION__, color));
		return gen3_init_solid(channel, color);
	}

	if (sna_picture_is_clear(picture, x, y, w, h, &color)) {
		DBG(("%s: clear drawable [%08x]\n", __FUNCTION__, color));
		return gen3_init_solid(channel, solid_color(picture->format, color));
	}

	if (!gen3_check_repeat(picture))
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);

	if (!gen3_check_filter(picture))
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);

	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
	channel->filter = picture->filter;
	channel->pict_format = picture->format;

	pixmap = get_drawable_pixmap(picture->pDrawable);
	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);

	x += dx + picture->pDrawable->x;
	y += dy + picture->pDrawable->y;

	/* Fold an integer-translation transform into (x, y) so the
	 * hardware can sample untransformed. */
	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
		DBG(("%s: integer translation (%d, %d), removing\n",
		     __FUNCTION__, dx, dy));
		x += dx;
		y += dy;
		channel->transform = NULL;
		channel->filter = PictFilterNearest;

		if (channel->repeat ||
		    (x >= 0 &&
		     y >= 0 &&
		     x + w <= pixmap->drawable.width &&
		     y + h <= pixmap->drawable.height)) {
			struct sna_pixmap *priv = sna_pixmap(pixmap);
			if (priv && priv->clear) {
				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
				return gen3_init_solid(channel, solid_color(picture->format, priv->clear_color));
			}
		}
	} else {
		channel->transform = picture->transform;
		channel->is_affine = sna_transform_is_affine(picture->transform);
	}

	/* No directly-samplable hardware format: convert the pixels. */
	if (!gen3_composite_channel_set_format(channel, picture->format) &&
	    !gen3_composite_channel_set_xformat(picture, channel, x, y, w, h))
		return sna_render_picture_convert(sna, picture, channel, pixmap,
						  x, y, w, h, dst_x, dst_y,
						  false);
	assert(channel->card_format);

	/* Exceeds the 3D pipeline's surface limits: extract just the
	 * sampled subregion into a smaller bo. */
	if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
		DBG(("%s: pixmap too large (%dx%d), extracting (%d, %d)x(%d,%d)\n",
		     __FUNCTION__,
		     pixmap->drawable.width, pixmap->drawable.height,
		     x, y, w, h));
		return sna_render_picture_extract(sna, picture, channel,
						  x, y, w, h, dst_x, dst_y);
	}

	return sna_render_pixmap_bo(sna, channel, pixmap,
				    x, y, w, h, dst_x, dst_y);
}
3178
3179static void
3180gen3_align_vertex(struct sna *sna,
3181		  const struct sna_composite_op *op)
3182{
3183	int vertex_index;
3184
3185	if (op->floats_per_vertex == sna->render_state.gen3.last_floats_per_vertex)
3186		return;
3187
3188	DBG(("aligning vertex: was %d, now %d floats per vertex\n",
3189	     sna->render_state.gen3.last_floats_per_vertex,
3190	     op->floats_per_vertex));
3191
3192	assert(op->floats_per_rect == 3*op->floats_per_vertex);
3193
3194	vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
3195	if ((int)sna->render.vertex_size - vertex_index * op->floats_per_vertex < 2*op->floats_per_rect) {
3196		DBG(("%s: flushing vertex buffer: new index=%d, max=%d\n",
3197		     __FUNCTION__, vertex_index, sna->render.vertex_size / op->floats_per_vertex));
3198		if (gen3_vertex_finish(sna) < 2*op->floats_per_vertex)
3199			kgem_submit(&sna->kgem);
3200
3201		vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
3202	}
3203
3204	sna->render.vertex_index = vertex_index;
3205	sna->render.vertex_used = vertex_index * op->floats_per_vertex;
3206}
3207
/* Choose and validate the render target for a composite operation.
 * Fills in op->dst (pixmap, format, size, bo, drawable deltas, damage)
 * and returns false if the destination cannot be rendered to with the
 * gen3 3D pipeline (forcing a fallback or BLT path).
 *
 * x/y/w/h describe the area to be written (w==h==0 means the whole
 * picture extents); partial indicates that the existing destination
 * contents outside the write area must be preserved.
 */
static bool
gen3_composite_set_target(struct sna *sna,
			  struct sna_composite_op *op,
			  PicturePtr dst,
			  int x, int y, int w, int h,
			  bool partial)
{
	BoxRec box;
	unsigned hint;

	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
	op->dst.format = dst->format;
	op->dst.width = op->dst.pixmap->drawable.width;
	op->dst.height = op->dst.pixmap->drawable.height;

	if (w && h) {
		box.x1 = x;
		box.y1 = y;
		box.x2 = x + w;
		box.y2 = y + h;
	} else
		sna_render_picture_extents(dst, &box);

	/* Prefer the GPU; only force it when the target fits without
	 * tiling. A full-coverage, non-partial write replaces all prior
	 * contents and so can ignore existing damage.
	 */
	hint = PREFER_GPU | RENDER_GPU;
	if (!need_tiling(sna, op->dst.width, op->dst.height))
		hint |= FORCE_GPU;
	if (!partial) {
		hint |= IGNORE_DAMAGE;
		if (w == op->dst.width && h == op->dst.height)
			hint |= REPLACES;
	}

	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
	if (op->dst.bo == NULL)
		return false;

	if (hint & REPLACES) {
		/* Entire surface is overwritten: any queued work on the
		 * old buffers is now moot and can be discarded.
		 */
		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
	}

	assert(op->dst.bo->unique_id);

	/* For single-stream mode there should be no minimum alignment
	 * required, except that the width must be at least 2 elements.
	 * Furthermore, it appears that the pitch must be a multiple of
	 * 2 elements.
	 */
	if (op->dst.bo->pitch & ((2*op->dst.pixmap->drawable.bitsPerPixel >> 3) - 1))
		return false;

	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
			    &op->dst.x, &op->dst.y);

	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
	     __FUNCTION__,
	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
	     op->dst.width, op->dst.height,
	     op->dst.bo->pitch,
	     op->dst.x, op->dst.y,
	     op->damage ? *op->damage : (void *)-1));

	assert(op->dst.bo->proxy == NULL);

	/* Oversized or badly-pitched targets are rendered via a proxy
	 * and copied back afterwards (redirection).
	 */
	if ((too_large(op->dst.width, op->dst.height) ||
	     !gen3_check_pitch_3d(op->dst.bo)) &&
	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
		return false;

	return true;
}
3279
/* Multiply two 8-bit channel values treated as fractions in [0, 1],
 * i.e. round(a*b/255), computed without a division (the classic
 * Porter-Duff channel multiply with rounding bias 0x7f).
 */
static inline uint8_t
mul_8_8(uint8_t a, uint8_t b)
{
	uint16_t product = (uint16_t)a * b + 0x7f;
	product += product >> 8;
	return product >> 8;
}
3286
/* Multiply one 8-bit channel of the source value s (selected by shift)
 * by the mask's alpha (top byte of m), returning the scaled channel in
 * its original bit position.
 *
 * The 8x8 rounding multiply is performed inline (same math as
 * mul_8_8), and the final shift is done in uint32_t: the previous
 * formulation shifted an int by up to 24 bits, which overflows signed
 * int for results >= 128 at shift 24 (undefined behaviour).
 */
static inline uint32_t multa(uint32_t s, uint32_t m, int shift)
{
	uint32_t chan = (s >> shift) & 0xff;	/* selected source channel */
	uint32_t alpha = m >> 24;		/* mask alpha */
	uint32_t t = chan * alpha + 0x7f;	/* round(chan*alpha/255) ... */
	t += t >> 8;				/* ... without dividing */
	return ((t >> 8) & 0xff) << shift;
}
3291
3292static inline bool is_constant_ps(uint32_t type)
3293{
3294	switch (type) {
3295	case SHADER_NONE: /* be warned! */
3296	case SHADER_ZERO:
3297	case SHADER_BLACK:
3298	case SHADER_WHITE:
3299	case SHADER_CONSTANT:
3300		return true;
3301	default:
3302		return false;
3303	}
3304}
3305
3306static bool
3307has_alphamap(PicturePtr p)
3308{
3309	return p->alphaMap != NULL;
3310}
3311
3312static bool
3313need_upload(PicturePtr p)
3314{
3315	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
3316}
3317
3318static bool
3319source_is_busy(PixmapPtr pixmap)
3320{
3321	struct sna_pixmap *priv = sna_pixmap(pixmap);
3322	if (priv == NULL)
3323		return false;
3324
3325	if (priv->clear)
3326		return false;
3327
3328	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
3329		return true;
3330
3331	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
3332		return true;
3333
3334	return priv->gpu_damage && !priv->cpu_damage;
3335}
3336
3337static bool
3338is_unhandled_gradient(PicturePtr picture, bool precise)
3339{
3340	if (picture->pDrawable)
3341		return false;
3342
3343	switch (picture->pSourcePict->type) {
3344	case SourcePictTypeSolidFill:
3345	case SourcePictTypeLinear:
3346	case SourcePictTypeRadial:
3347		return false;
3348	default:
3349		return precise;
3350	}
3351}
3352
/* Decide whether using picture p as a composite source would force a
 * software fallback. pixmap is p's backing pixmap (NULL for
 * source-only pictures). The order of the checks encodes policy: a
 * GPU-busy pixmap is kept on the GPU even if a filter or alpha-map
 * fixup would otherwise suggest falling back.
 */
static bool
source_fallback(PicturePtr p, PixmapPtr pixmap, bool precise)
{
	/* Solid colours are always handled via a constant shader. */
	if (sna_picture_is_solid(p, NULL))
		return false;

	if (is_unhandled_gradient(p, precise))
		return true;

	if (!gen3_check_xformat(p) || !gen3_check_repeat(p))
		return true;

	/* Busy on the GPU: prefer to keep the work there. */
	if (pixmap && source_is_busy(pixmap))
		return false;

	return has_alphamap(p) || !gen3_check_filter(p) || need_upload(p);
}
3370
/* Global GPU-vs-CPU decision for a composite request: returns true if
 * the whole operation should be performed by the software (CPU)
 * fallback path, false if it is worth attempting on the GPU. The
 * heuristic weighs where the destination and sources currently live
 * against the fixups each would need.
 */
static bool
gen3_composite_fallback(struct sna *sna,
			uint8_t op,
			PicturePtr src,
			PicturePtr mask,
			PicturePtr dst)
{
	PixmapPtr src_pixmap;
	PixmapPtr mask_pixmap;
	PixmapPtr dst_pixmap;
	bool src_fallback, mask_fallback;

	if (!gen3_check_dst_format(dst->format)) {
		DBG(("%s: unknown destination format: %d\n",
		     __FUNCTION__, dst->format));
		return true;
	}

	dst_pixmap = get_drawable_pixmap(dst->pDrawable);

	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
	src_fallback = source_fallback(src, src_pixmap,
				       dst->polyMode == PolyModePrecise);

	if (mask) {
		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
		mask_fallback = source_fallback(mask, mask_pixmap,
						dst->polyMode == PolyModePrecise);
	} else {
		mask_pixmap = NULL;
		mask_fallback = false;
	}

	/* If we are using the destination as a source and need to
	 * readback in order to upload the source, do it all
	 * on the cpu.
	 */
	if (src_pixmap == dst_pixmap && src_fallback) {
		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
		return true;
	}
	if (mask_pixmap == dst_pixmap && mask_fallback) {
		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
		return true;
	}

	/* Component-alpha masks need a two-pass trick that only works
	 * for PictOpOver (see need_magic_ca_pass); other ops that read
	 * source alpha cannot be expressed on gen3.
	 */
	if (mask &&
	    mask->componentAlpha && PICT_FORMAT_RGB(mask->format) &&
	    gen3_blend_op[op].src_alpha &&
	    gen3_blend_op[op].src_blend != BLENDFACT_ZERO &&
	    op != PictOpOver) {
		DBG(("%s: component-alpha mask with op=%d, should fallback\n",
		     __FUNCTION__, op));
		return true;
	}

	/* If anything is on the GPU, push everything out to the GPU */
	if (dst_use_gpu(dst_pixmap)) {
		DBG(("%s: dst is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	if (src_pixmap && !src_fallback) {
		DBG(("%s: src is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}
	if (mask_pixmap && !mask_fallback) {
		DBG(("%s: mask is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	/* However if the dst is not on the GPU and we need to
	 * render one of the sources using the CPU, we may
	 * as well do the entire operation in place onthe CPU.
	 */
	if (src_fallback) {
		DBG(("%s: dst is on the CPU and src will fallback\n",
		     __FUNCTION__));
		return true;
	}

	if (mask && mask_fallback) {
		DBG(("%s: dst is on the CPU and mask will fallback\n",
		     __FUNCTION__));
		return true;
	}

	if (too_large(dst_pixmap->drawable.width,
		      dst_pixmap->drawable.height) &&
	    dst_is_cpu(dst_pixmap)) {
		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
		return true;
	}

	DBG(("%s: dst is not on the GPU and the operation should not fallback: use-cpu? %d\n",
	     __FUNCTION__, dst_use_cpu(dst_pixmap)));
	return dst_use_cpu(dst_pixmap);
}
3472
/* If the mask samples the same drawable at the same offset as the
 * already-prepared source channel sc, duplicate sc into mc instead of
 * preparing the mask from scratch (saving an upload/conversion).
 * Returns true (and takes an extra reference on the bo) when mc has
 * been populated.
 */
static int
reuse_source(struct sna *sna,
	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
{
	/* Reuse is only valid when both channels sample from the same
	 * origin within the drawable.
	 */
	if (src_x != msk_x || src_y != msk_y)
		return false;

	if (mask == src) {
		/* Identical pictures: straight copy of the channel. */
		*mc = *sc;
		if (mc->bo)
			kgem_bo_reference(mc->bo);
		return true;
	}

	if ((src->pDrawable == NULL || mask->pDrawable != src->pDrawable))
		return false;

	/* A solid source channel carries no drawable to share. */
	if (sc->is_solid)
		return false;

	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));

	if (!sna_transform_equal(src->transform, mask->transform))
		return false;

	if (!sna_picture_alphamap_equal(src, mask))
		return false;

	if (!gen3_check_repeat(mask))
		return false;

	if (!gen3_check_filter(mask))
		return false;

	if (!gen3_check_format(mask))
		return false;

	DBG(("%s: reusing source channel for mask with a twist\n",
	     __FUNCTION__));

	/* Same bo and geometry, but the mask's own sampling state
	 * (repeat/filter/format) replaces the source's.
	 */
	*mc = *sc;
	mc->repeat = gen3_texture_repeat(mask->repeat ? mask->repeatType : RepeatNone);
	mc->filter = gen3_filter(mask->filter);
	mc->pict_format = mask->format;
	gen3_composite_channel_set_format(mc, mask->format);
	assert(mc->card_format);
	if (mc->bo)
		kgem_bo_reference(mc->bo);
	return true;
}
3524
/* Prepare a RENDER composite operation (dst = op(src IN mask, dst)) on
 * the gen3 3D pipeline. On success, tmp is populated with the blend
 * state, shader selection for source and mask, and the vertex-emission
 * callbacks, and true is returned. On failure the operation is routed
 * to the BLT engine or signalled for a CPU fallback (false).
 */
static bool
gen3_render_composite(struct sna *sna,
		      uint8_t op,
		      PicturePtr src,
		      PicturePtr mask,
		      PicturePtr dst,
		      int16_t src_x,  int16_t src_y,
		      int16_t mask_x, int16_t mask_y,
		      int16_t dst_x,  int16_t dst_y,
		      int16_t width,  int16_t height,
		      unsigned flags,
		      struct sna_composite_op *tmp)
{
	DBG(("%s()\n", __FUNCTION__));

	if (op >= ARRAY_SIZE(gen3_blend_op)) {
		DBG(("%s: fallback due to unhandled blend op: %d\n",
		     __FUNCTION__, op));
		return false;
	}

	/* Try to use the BLT engine unless it implies a
	 * 3D -> 2D context switch.
	 */
	if (mask == NULL &&
	    sna_blt_composite(sna,
			      op, src, dst,
			      src_x, src_y,
			      dst_x, dst_y,
			      width, height,
			      flags, tmp))
		return true;

	if (gen3_composite_fallback(sna, op, src, mask, dst))
		goto fallback;

	/* Targets larger than the 3D pipe can address are split into tiles. */
	if (need_tiling(sna, width, height))
		return sna_tiling_composite(op, src, mask, dst,
					    src_x,  src_y,
					    mask_x, mask_y,
					    dst_x,  dst_y,
					    width,  height,
					    tmp);

	if (!gen3_composite_set_target(sna, tmp, dst,
				       dst_x, dst_y, width, height,
				       flags & COMPOSITE_PARTIAL || op > PictOpSrc)) {
		DBG(("%s: unable to set render target\n",
		     __FUNCTION__));
		goto fallback;
	}

	/* Resolve the source picture into a shader type (+ bo/constants). */
	tmp->op = op;
	tmp->rb_reversed = gen3_dst_rb_reversed(tmp->dst.format);
	tmp->u.gen3.num_constants = 0;
	tmp->src.u.gen3.type = SHADER_TEXTURE;
	tmp->src.is_affine = true;
	DBG(("%s: preparing source\n", __FUNCTION__));
	switch (gen3_composite_picture(sna, src, tmp, &tmp->src,
				       src_x, src_y,
				       width, height,
				       dst_x, dst_y,
				       dst->polyMode == PolyModePrecise)) {
	case -1:
		goto cleanup_dst;
	case 0:
		tmp->src.u.gen3.type = SHADER_ZERO;
		break;
	case 1:
		/* A plain texture source without a mask may still be
		 * serviceable by the BLT engine after conversion.
		 */
		if (mask == NULL && tmp->src.bo &&
		    sna_blt_composite__convert(sna,
					       dst_x, dst_y, width, height,
					       tmp))
			return true;

		gen3_composite_channel_convert(&tmp->src);
		break;
	}
	DBG(("%s: source type=%d\n", __FUNCTION__, tmp->src.u.gen3.type));

	/* Resolve the mask picture, reusing the source channel if possible. */
	tmp->mask.u.gen3.type = SHADER_NONE;
	tmp->mask.is_affine = true;
	tmp->need_magic_ca_pass = false;
	tmp->has_component_alpha = false;
	if (mask && tmp->src.u.gen3.type != SHADER_ZERO) {
		if (!reuse_source(sna,
				  src, &tmp->src, src_x, src_y,
				  mask, &tmp->mask, mask_x, mask_y)) {
			tmp->mask.u.gen3.type = SHADER_TEXTURE;
			DBG(("%s: preparing mask\n", __FUNCTION__));
			switch (gen3_composite_picture(sna, mask, tmp, &tmp->mask,
						       mask_x, mask_y,
						       width,  height,
						       dst_x,  dst_y,
						       dst->polyMode == PolyModePrecise)) {
			case -1:
				goto cleanup_src;
			case 0:
				tmp->mask.u.gen3.type = SHADER_ZERO;
				break;
			case 1:
				gen3_composite_channel_convert(&tmp->mask);
				break;
			}
		}
		DBG(("%s: mask type=%d\n", __FUNCTION__, tmp->mask.u.gen3.type));
		/* A zero mask zeroes the whole operation. */
		if (tmp->mask.u.gen3.type == SHADER_ZERO) {
			if (tmp->src.bo) {
				kgem_bo_destroy(&sna->kgem,
						tmp->src.bo);
				tmp->src.bo = NULL;
			}
			tmp->src.u.gen3.type = SHADER_ZERO;
			tmp->mask.u.gen3.type = SHADER_NONE;
		}

		if (tmp->mask.u.gen3.type != SHADER_NONE) {
			if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
				/* Check if it's component alpha that relies on a source alpha
				 * and on the source value.  We can only get one of those
				 * into the single source value that we get to blend with.
				 */
				DBG(("%s: component-alpha mask: %d\n",
				     __FUNCTION__, tmp->mask.u.gen3.type));
				tmp->has_component_alpha = true;
				if (tmp->mask.u.gen3.type == SHADER_WHITE) {
					tmp->mask.u.gen3.type = SHADER_NONE;
					tmp->has_component_alpha = false;
				} else if (gen3_blend_op[op].src_alpha &&
					   gen3_blend_op[op].src_blend != BLENDFACT_ZERO) {
					if (op != PictOpOver)
						goto cleanup_mask;

					/* Two-pass trick: OutReverse first,
					 * then the magic CA pass adds the
					 * source term.
					 */
					tmp->need_magic_ca_pass = true;
					tmp->op = PictOpOutReverse;
				}
			} else {
				if (tmp->mask.is_opaque) {
					/* An opaque mask is a no-op. */
					tmp->mask.u.gen3.type = SHADER_NONE;
				} else if (is_constant_ps(tmp->src.u.gen3.type) &&
					   is_constant_ps(tmp->mask.u.gen3.type)) {
					uint32_t v;

					/* Fold constant source x constant
					 * mask alpha into a single constant
					 * colour, channel by channel.
					 */
					v = multa(tmp->src.u.gen3.mode,
						  tmp->mask.u.gen3.mode,
						  24);
					v |= multa(tmp->src.u.gen3.mode,
						   tmp->mask.u.gen3.mode,
						   16);
					v |= multa(tmp->src.u.gen3.mode,
						   tmp->mask.u.gen3.mode,
						   8);
					v |= multa(tmp->src.u.gen3.mode,
						   tmp->mask.u.gen3.mode,
						   0);

					DBG(("%s: combining constant source/mask: %x x %x -> %x\n",
					     __FUNCTION__,
					     tmp->src.u.gen3.mode,
					     tmp->mask.u.gen3.mode,
					     v));

					tmp->src.u.gen3.type = SHADER_CONSTANT;
					tmp->src.u.gen3.mode = v;
					tmp->src.is_opaque = false;

					tmp->mask.u.gen3.type = SHADER_NONE;
				}
			}
		}
	}
	DBG(("%s: final src/mask type=%d/%d [constant? %d/%d], transform? %d/%d, affine=%d/%d\n", __FUNCTION__,
	     tmp->src.u.gen3.type, tmp->mask.u.gen3.type,
	     is_constant_ps(tmp->src.u.gen3.type),
	     is_constant_ps(tmp->mask.u.gen3.type),
	     !!tmp->src.transform, !!tmp->mask.transform,
	     tmp->src.is_affine, tmp->mask.is_affine));

	/* Select the most specialised vertex emitter matching the final
	 * shader combination (with SSE2 variants on 32-bit builds).
	 */
	tmp->prim_emit = gen3_emit_composite_primitive;
	if (is_constant_ps(tmp->mask.u.gen3.type)) {
		switch (tmp->src.u.gen3.type) {
		case SHADER_NONE:
		case SHADER_ZERO:
		case SHADER_BLACK:
		case SHADER_WHITE:
		case SHADER_CONSTANT:
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_primitive_constant__sse2;
				tmp->emit_boxes = gen3_emit_composite_boxes_constant__sse2;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_primitive_constant;
				tmp->emit_boxes = gen3_emit_composite_boxes_constant;
			}

			break;
		case SHADER_LINEAR:
		case SHADER_RADIAL:
			if (tmp->src.transform == NULL) {
#if defined(sse2) && !defined(__x86_64__)
				if (sna->cpu_features & SSE2) {
					tmp->prim_emit = gen3_emit_composite_primitive_identity_gradient__sse2;
					tmp->emit_boxes = gen3_emit_composite_boxes_identity_gradient__sse2;
				} else
#endif
				{
					tmp->prim_emit = gen3_emit_composite_primitive_identity_gradient;
					tmp->emit_boxes = gen3_emit_composite_boxes_identity_gradient;
				}
			} else if (tmp->src.is_affine) {
				tmp->src.scale[1] = tmp->src.scale[0] = 1. / tmp->src.transform->matrix[2][2];
#if defined(sse2) && !defined(__x86_64__)
				if (sna->cpu_features & SSE2) {
					tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient__sse2;
					tmp->emit_boxes = gen3_emit_composite_boxes_affine_gradient__sse2;
				} else
#endif
				{
					tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient;
					tmp->emit_boxes = gen3_emit_composite_boxes_affine_gradient;
				}
			}
			break;
		case SHADER_TEXTURE:
			if (tmp->src.transform == NULL) {
				if ((tmp->src.offset[0]|tmp->src.offset[1]|tmp->dst.x|tmp->dst.y) == 0) {
#if defined(sse2) && !defined(__x86_64__)
					if (sna->cpu_features & SSE2) {
						tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset__sse2;
						tmp->emit_boxes = gen3_emit_composite_boxes_identity_source_no_offset__sse2;
					} else
#endif
					{
						tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset;
						tmp->emit_boxes = gen3_emit_composite_boxes_identity_source_no_offset;
					}
				} else {
#if defined(sse2) && !defined(__x86_64__)
					if (sna->cpu_features & SSE2) {
						tmp->prim_emit = gen3_emit_composite_primitive_identity_source__sse2;
						tmp->emit_boxes = gen3_emit_composite_boxes_identity_source__sse2;
					} else
#endif
					{
						tmp->prim_emit = gen3_emit_composite_primitive_identity_source;
						tmp->emit_boxes = gen3_emit_composite_boxes_identity_source;
					}
				}
			} else if (tmp->src.is_affine) {
				tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
				tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
#if defined(sse2) && !defined(__x86_64__)
				if (sna->cpu_features & SSE2) {
					tmp->prim_emit = gen3_emit_composite_primitive_affine_source__sse2;
					tmp->emit_boxes = gen3_emit_composite_boxes_affine_source__sse2;
				} else
#endif
				{
					tmp->prim_emit = gen3_emit_composite_primitive_affine_source;
					tmp->emit_boxes = gen3_emit_composite_boxes_affine_source;
				}
			}
			break;
		}
	} else if (tmp->mask.u.gen3.type == SHADER_TEXTURE) {
		if (tmp->mask.transform == NULL) {
			if (is_constant_ps(tmp->src.u.gen3.type)) {
				if ((tmp->mask.offset[0]|tmp->mask.offset[1]|tmp->dst.x|tmp->dst.y) == 0) {
#if defined(sse2) && !defined(__x86_64__)
					if (sna->cpu_features & SSE2) {
						tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask_no_offset__sse2;
					} else
#endif
					{
						tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask_no_offset;
					}
				} else {
#if defined(sse2) && !defined(__x86_64__)
					if (sna->cpu_features & SSE2) {
						tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask__sse2;
					} else
#endif
					{
						tmp->prim_emit = gen3_emit_composite_primitive_constant_identity_mask;
					}
				}
			} else if (tmp->src.transform == NULL) {
#if defined(sse2) && !defined(__x86_64__)
				if (sna->cpu_features & SSE2) {
					tmp->prim_emit = gen3_emit_composite_primitive_identity_source_mask__sse2;
				} else
#endif
				{
					tmp->prim_emit = gen3_emit_composite_primitive_identity_source_mask;
				}
			} else if (tmp->src.is_affine) {
				tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
				tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
#if defined(sse2) && !defined(__x86_64__)
				if (sna->cpu_features & SSE2) {
					tmp->prim_emit = gen3_emit_composite_primitive_affine_source_mask__sse2;
				} else
#endif
				{
					tmp->prim_emit = gen3_emit_composite_primitive_affine_source_mask;
				}
			}
		}
	}

	/* 2 floats for position plus 2 (affine) or 4 (projective)
	 * texcoord floats per non-constant channel.
	 */
	tmp->floats_per_vertex = 2;
	if (!is_constant_ps(tmp->src.u.gen3.type))
		tmp->floats_per_vertex += tmp->src.is_affine ? 2 : 4;
	if (!is_constant_ps(tmp->mask.u.gen3.type))
		tmp->floats_per_vertex += tmp->mask.is_affine ? 2 : 4;
	DBG(("%s: floats_per_vertex = 2 + %d + %d = %d [specialised emitter? %d]\n", __FUNCTION__,
	     !is_constant_ps(tmp->src.u.gen3.type) ? tmp->src.is_affine ? 2 : 4 : 0,
	     !is_constant_ps(tmp->mask.u.gen3.type) ? tmp->mask.is_affine ? 2 : 4 : 0,
	     tmp->floats_per_vertex,
	     tmp->prim_emit != gen3_emit_composite_primitive));
	tmp->floats_per_rect = 3 * tmp->floats_per_vertex;

	tmp->blt   = gen3_render_composite_blt;
	tmp->box   = gen3_render_composite_box;
	tmp->boxes = gen3_render_composite_boxes__blt;
	if (tmp->emit_boxes) {
		tmp->boxes = gen3_render_composite_boxes;
		tmp->thread_boxes = gen3_render_composite_boxes__thread;
	}
	tmp->done  = gen3_render_composite_done;

	/* Make sure all three bos fit in the current batch; if not,
	 * flush and retry once before giving up.
	 */
	if (!kgem_check_bo(&sna->kgem,
			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
			   NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem,
				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
				   NULL))
			goto cleanup_mask;
	}

	gen3_align_vertex(sna, tmp);
	gen3_emit_composite_state(sna, tmp);
	return true;

cleanup_mask:
	if (tmp->mask.bo) {
		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
		tmp->mask.bo = NULL;
	}
cleanup_src:
	if (tmp->src.bo) {
		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
		tmp->src.bo = NULL;
	}
cleanup_dst:
	if (tmp->redirect.real_bo) {
		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
		tmp->redirect.real_bo = NULL;
	}
fallback:
	/* Last resort: let the BLT engine try, flagged as a fallback. */
	return (mask == NULL &&
		sna_blt_composite(sna,
				  op, src, dst,
				  src_x, src_y,
				  dst_x, dst_y,
				  width, height,
				  flags | COMPOSITE_FALLBACK, tmp));
}
3896
/* Emit one span vertex: destination coordinate, the source texcoord
 * for (x, y), and the span opacity (fed to the OPACITY shader channel).
 */
static void
gen3_emit_composite_spans_vertex(struct sna *sna,
				 const struct sna_composite_spans_op *op,
				 int16_t x, int16_t y,
				 float opacity)
{
	gen3_emit_composite_dstcoord(sna, x + op->base.dst.x, y + op->base.dst.y);
	gen3_emit_composite_texcoord(sna, &op->base.src, x, y);
	OUT_VERTEX(opacity);
}
3907
3908fastcall static void
3909gen3_emit_composite_spans_primitive_zero(struct sna *sna,
3910					 const struct sna_composite_spans_op *op,
3911					 const BoxRec *box,
3912					 float opacity)
3913{
3914	float *v = sna->render.vertices + sna->render.vertex_used;
3915	sna->render.vertex_used += 6;
3916	assert(sna->render.vertex_used <= sna->render.vertex_size);
3917
3918	v[0] = op->base.dst.x + box->x2;
3919	v[1] = op->base.dst.y + box->y2;
3920
3921	v[2] = op->base.dst.x + box->x1;
3922	v[3] = v[1];
3923
3924	v[4] = v[2];
3925	v[5] = op->base.dst.x + box->y1;
3926}
3927
3928fastcall static void
3929gen3_emit_composite_spans_primitive_zero__boxes(const struct sna_composite_spans_op *op,
3930						const struct sna_opacity_box *b,
3931						int nbox, float *v)
3932{
3933	do {
3934		v[0] = op->base.dst.x + b->box.x2;
3935		v[1] = op->base.dst.y + b->box.y2;
3936
3937		v[2] = op->base.dst.x + b->box.x1;
3938		v[3] = v[1];
3939
3940		v[4] = v[2];
3941		v[5] = op->base.dst.x + b->box.y1;
3942
3943		v += 6;
3944		b++;
3945	} while (--nbox);
3946}
3947
3948fastcall static void
3949gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna,
3950						   const struct sna_composite_spans_op *op,
3951						   const BoxRec *box,
3952						   float opacity)
3953{
3954	float *v = sna->render.vertices + sna->render.vertex_used;
3955	sna->render.vertex_used += 6;
3956	assert(sna->render.vertex_used <= sna->render.vertex_size);
3957
3958	v[0] = box->x2;
3959	v[3] = v[1] = box->y2;
3960	v[4] = v[2] = box->x1;
3961	v[5] = box->y1;
3962}
3963
3964fastcall static void
3965gen3_emit_composite_spans_primitive_zero_no_offset__boxes(const struct sna_composite_spans_op *op,
3966							  const struct sna_opacity_box *b,
3967							  int nbox, float *v)
3968{
3969	do {
3970		v[0] = b->box.x2;
3971		v[3] = v[1] = b->box.y2;
3972		v[4] = v[2] = b->box.x1;
3973		v[5] = b->box.y1;
3974
3975		b++;
3976		v += 6;
3977	} while (--nbox);
3978}
3979
3980fastcall static void
3981gen3_emit_composite_spans_primitive_constant(struct sna *sna,
3982					     const struct sna_composite_spans_op *op,
3983					     const BoxRec *box,
3984					     float opacity)
3985{
3986	float *v = sna->render.vertices + sna->render.vertex_used;
3987	sna->render.vertex_used += 9;
3988	assert(sna->render.vertex_used <= sna->render.vertex_size);
3989
3990	v[0] = op->base.dst.x + box->x2;
3991	v[6] = v[3] = op->base.dst.x + box->x1;
3992	v[4] = v[1] = op->base.dst.y + box->y2;
3993	v[7] = op->base.dst.y + box->y1;
3994	v[8] = v[5] = v[2] = opacity;
3995}
3996
3997fastcall static void
3998gen3_emit_composite_spans_primitive_constant__boxes(const struct sna_composite_spans_op *op,
3999						    const struct sna_opacity_box *b,
4000						    int nbox,
4001						    float *v)
4002{
4003	do {
4004		v[0] = op->base.dst.x + b->box.x2;
4005		v[6] = v[3] = op->base.dst.x + b->box.x1;
4006		v[4] = v[1] = op->base.dst.y + b->box.y2;
4007		v[7] = op->base.dst.y + b->box.y1;
4008		v[8] = v[5] = v[2] = b->alpha;
4009
4010		v += 9;
4011		b++;
4012	} while (--nbox);
4013}
4014
4015fastcall static void
4016gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna,
4017						       const struct sna_composite_spans_op *op,
4018						       const BoxRec *box,
4019						       float opacity)
4020{
4021	float *v = sna->render.vertices + sna->render.vertex_used;
4022	sna->render.vertex_used += 9;
4023	assert(sna->render.vertex_used <= sna->render.vertex_size);
4024
4025	v[0] = box->x2;
4026	v[6] = v[3] = box->x1;
4027	v[4] = v[1] = box->y2;
4028	v[7] = box->y1;
4029	v[8] = v[5] = v[2] = opacity;
4030}
4031
4032fastcall static void
4033gen3_emit_composite_spans_primitive_constant_no_offset__boxes(const struct sna_composite_spans_op *op,
4034							      const struct sna_opacity_box *b,
4035							      int nbox, float *v)
4036{
4037	do {
4038		v[0] = b->box.x2;
4039		v[6] = v[3] = b->box.x1;
4040		v[4] = v[1] = b->box.y2;
4041		v[7] = b->box.y1;
4042		v[8] = v[5] = v[2] = b->alpha;
4043
4044		v += 9;
4045		b++;
4046	} while (--nbox);
4047}
4048
/* Emit a span rectangle sampling an untransformed (identity) texture
 * source: three vertices of (x, y, u, v, opacity), where the texcoords
 * are the box corners offset into the source and normalised by the
 * texture scale.
 */
fastcall static void
gen3_emit_composite_spans_primitive_identity_source(struct sna *sna,
						    const struct sna_composite_spans_op *op,
						    const BoxRec *box,
						    float opacity)
{
	float *v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 15;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* vertex 1: bottom-right (x2, y2) */
	v[0] = op->base.dst.x + box->x2;
	v[1] = op->base.dst.y + box->y2;
	v[2] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0];
	v[3] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1];
	v[4] = opacity;

	/* vertex 2: bottom-left (x1, y2) */
	v[5] = op->base.dst.x + box->x1;
	v[6] = v[1];
	v[7] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0];
	v[8] = v[3];
	v[9] = opacity;

	/* vertex 3: top-left (x1, y1) */
	v[10] = v[5];
	v[11] = op->base.dst.y + box->y1;
	v[12] = v[7];
	v[13] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1];
	v[14] = opacity;
}
4077
/* Batched variant of the identity-source span emitter: nbox
 * rectangles, three (x, y, u, v, alpha) vertices each.
 */
fastcall static void
gen3_emit_composite_spans_primitive_identity_source__boxes(const struct sna_composite_spans_op *op,
							   const struct sna_opacity_box *b,
							   int nbox,
							   float *v)
{
	do {
		/* bottom-right, bottom-left, top-left -- as the
		 * single-box variant above.
		 */
		v[0] = op->base.dst.x + b->box.x2;
		v[1] = op->base.dst.y + b->box.y2;
		v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0];
		v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1];
		v[4] = b->alpha;

		v[5] = op->base.dst.x + b->box.x1;
		v[6] = v[1];
		v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0];
		v[8] = v[3];
		v[9] = b->alpha;

		v[10] = v[5];
		v[11] = op->base.dst.y + b->box.y1;
		v[12] = v[7];
		v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1];
		v[14] = b->alpha;

		v += 15;
		b++;
	} while (--nbox);
}
4107
/* Emit a span rectangle sampling an affine-transformed texture source:
 * three vertices of (x, y, u, v, opacity), with each texcoord obtained
 * by pushing the box corner through the picture transform and scaling.
 */
fastcall static void
gen3_emit_composite_spans_primitive_affine_source(struct sna *sna,
						  const struct sna_composite_spans_op *op,
						  const BoxRec *box,
						  float opacity)
{
	PictTransform *transform = op->base.src.transform;
	float *v;

	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 15;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* Destination positions and opacity first... */
	v[0]  = op->base.dst.x + box->x2;
	v[6]  = v[1] = op->base.dst.y + box->y2;
	v[10] = v[5] = op->base.dst.x + box->x1;
	v[11] = op->base.dst.y + box->y1;
	v[14] = v[9] = v[4]  = opacity;

	/* ... then the transformed texcoords for (x2,y2), (x1,y2), (x1,y1). */
	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2,
				    (int)op->base.src.offset[1] + box->y2,
				    transform, op->base.src.scale,
				    &v[2], &v[3]);

	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
				    (int)op->base.src.offset[1] + box->y2,
				    transform, op->base.src.scale,
				    &v[7], &v[8]);

	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
				    (int)op->base.src.offset[1] + box->y1,
				    transform, op->base.src.scale,
				    &v[12], &v[13]);
}
4142
/* Batched variant of the affine-source span emitter: nbox rectangles,
 * three (x, y, u, v, alpha) vertices each, texcoords run through the
 * picture transform.
 */
fastcall static void
gen3_emit_composite_spans_primitive_affine_source__boxes(const struct sna_composite_spans_op *op,
							 const struct sna_opacity_box *b,
							 int nbox,
							 float *v)
{
	PictTransform *transform = op->base.src.transform;

	do {
		/* Destination positions and opacity... */
		v[0]  = op->base.dst.x + b->box.x2;
		v[6]  = v[1] = op->base.dst.y + b->box.y2;
		v[10] = v[5] = op->base.dst.x + b->box.x1;
		v[11] = op->base.dst.y + b->box.y1;
		v[14] = v[9] = v[4]  = b->alpha;

		/* ... then transformed texcoords for the three corners. */
		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2,
					    (int)op->base.src.offset[1] + b->box.y2,
					    transform, op->base.src.scale,
					    &v[2], &v[3]);

		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
					    (int)op->base.src.offset[1] + b->box.y2,
					    transform, op->base.src.scale,
					    &v[7], &v[8]);

		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
					    (int)op->base.src.offset[1] + b->box.y1,
					    transform, op->base.src.scale,
					    &v[12], &v[13]);
		v += 15;
		b++;
	} while (--nbox);
}
4176
4177fastcall static void
4178gen3_emit_composite_spans_primitive_identity_gradient(struct sna *sna,
4179						      const struct sna_composite_spans_op *op,
4180						      const BoxRec *box,
4181						      float opacity)
4182{
4183	float *v = sna->render.vertices + sna->render.vertex_used;
4184	sna->render.vertex_used += 15;
4185	assert(sna->render.vertex_used <= sna->render.vertex_size);
4186
4187	v[0] = op->base.dst.x + box->x2;
4188	v[1] = op->base.dst.y + box->y2;
4189	v[2] = op->base.src.offset[0] + box->x2;
4190	v[3] = op->base.src.offset[1] + box->y2;
4191	v[4] = opacity;
4192
4193	v[5] = op->base.dst.x + box->x1;
4194	v[6] = v[1];
4195	v[7] = op->base.src.offset[0] + box->x1;
4196	v[8] = v[3];
4197	v[9] = opacity;
4198
4199	v[10] = v[5];
4200	v[11] = op->base.dst.y + box->y1;
4201	v[12] = v[7];
4202	v[13] = op->base.src.offset[1] + box->y1;
4203	v[14] = opacity;
4204}
4205
4206fastcall static void
4207gen3_emit_composite_spans_primitive_identity_gradient__boxes(const struct sna_composite_spans_op *op,
4208							     const struct sna_opacity_box *b,
4209							     int nbox,
4210							     float *v)
4211{
4212	do {
4213		v[0] = op->base.dst.x + b->box.x2;
4214		v[1] = op->base.dst.y + b->box.y2;
4215		v[2] = op->base.src.offset[0] + b->box.x2;
4216		v[3] = op->base.src.offset[1] + b->box.y2;
4217		v[4] = b->alpha;
4218
4219		v[5] = op->base.dst.x + b->box.x1;
4220		v[6] = v[1];
4221		v[7] = op->base.src.offset[0] + b->box.x1;
4222		v[8] = v[3];
4223		v[9] = b->alpha;
4224
4225		v[10] = v[5];
4226		v[11] = op->base.dst.y + b->box.y1;
4227		v[12] = v[7];
4228		v[13] = op->base.src.offset[1] + b->box.y1;
4229		v[14] = b->alpha;
4230
4231		v += 15;
4232		b++;
4233	} while (--nbox);
4234}
4235
4236#if defined(sse2) && !defined(__x86_64__)
4237sse2 fastcall static void
4238gen3_emit_composite_spans_primitive_constant__sse2(struct sna *sna,
4239						   const struct sna_composite_spans_op *op,
4240						   const BoxRec *box,
4241						   float opacity)
4242{
4243	float *v = sna->render.vertices + sna->render.vertex_used;
4244	sna->render.vertex_used += 9;
4245	assert(sna->render.vertex_used <= sna->render.vertex_size);
4246
4247	v[0] = op->base.dst.x + box->x2;
4248	v[6] = v[3] = op->base.dst.x + box->x1;
4249	v[4] = v[1] = op->base.dst.y + box->y2;
4250	v[7] = op->base.dst.y + box->y1;
4251	v[8] = v[5] = v[2] = opacity;
4252}
4253
4254sse2 fastcall static void
4255gen3_emit_composite_spans_primitive_constant__sse2__boxes(const struct sna_composite_spans_op *op,
4256							  const struct sna_opacity_box *b,
4257							  int nbox,
4258							  float *v)
4259{
4260	do {
4261		v[0] = op->base.dst.x + b->box.x2;
4262		v[6] = v[3] = op->base.dst.x + b->box.x1;
4263		v[4] = v[1] = op->base.dst.y + b->box.y2;
4264		v[7] = op->base.dst.y + b->box.y1;
4265		v[8] = v[5] = v[2] = b->alpha;
4266
4267		v += 9;
4268		b++;
4269	} while (--nbox);
4270}
4271
4272sse2 fastcall static void
4273gen3_render_composite_spans_constant_box__sse2(struct sna *sna,
4274					       const struct sna_composite_spans_op *op,
4275					       const BoxRec *box, float opacity)
4276{
4277	float *v;
4278	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
4279	     __FUNCTION__,
4280	     op->base.src.offset[0], op->base.src.offset[1],
4281	     opacity,
4282	     op->base.dst.x, op->base.dst.y,
4283	     box->x1, box->y1,
4284	     box->x2 - box->x1,
4285	     box->y2 - box->y1));
4286
4287	gen3_get_rectangles(sna, &op->base, 1);
4288
4289	v = sna->render.vertices + sna->render.vertex_used;
4290	sna->render.vertex_used += 9;
4291	assert(sna->render.vertex_used <= sna->render.vertex_size);
4292
4293	v[0] = box->x2;
4294	v[6] = v[3] = box->x1;
4295	v[4] = v[1] = box->y2;
4296	v[7] = box->y1;
4297	v[8] = v[5] = v[2] = opacity;
4298}
4299
sse2 fastcall static void
gen3_render_composite_spans_constant_thread__sse2__boxes(struct sna *sna,
							 const struct sna_composite_spans_op *op,
							 const struct sna_opacity_box *box,
							 int nbox)
{
	/* Thread-safe batched emission of constant-shader boxes:
	 * reserve vertex space under the render vertex lock, then
	 * fill the reserved floats (9 per box: x, y, alpha per
	 * vertex) outside the lock so other threads can reserve
	 * concurrently. */
	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     op->base.dst.x, op->base.dst.y));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		/* May emit fewer than nbox if the batch is nearly full. */
		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * 9;
		assert(sna->render.vertex_used <= sna->render.vertex_size);

		/* Hold a reference on the vertex buffer while we write
		 * without the lock held. */
		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		do {
			v[0] = box->box.x2;
			v[6] = v[3] = box->box.x1;
			v[4] = v[1] = box->box.y2;
			v[7] = box->box.y1;
			v[8] = v[5] = v[2] = box->alpha;
			v += 9;
			box++;
		} while (--nbox_this_time);

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
4342
4343sse2 fastcall static void
4344gen3_emit_composite_spans_primitive_constant__sse2__no_offset(struct sna *sna,
4345							      const struct sna_composite_spans_op *op,
4346							      const BoxRec *box,
4347							      float opacity)
4348{
4349	float *v = sna->render.vertices + sna->render.vertex_used;
4350	sna->render.vertex_used += 9;
4351	assert(sna->render.vertex_used <= sna->render.vertex_size);
4352
4353	v[0] = box->x2;
4354	v[6] = v[3] = box->x1;
4355	v[4] = v[1] = box->y2;
4356	v[7] = box->y1;
4357	v[8] = v[5] = v[2] = opacity;
4358}
4359
4360sse2 fastcall static void
4361gen3_emit_composite_spans_primitive_constant__sse2__no_offset__boxes(const struct sna_composite_spans_op *op,
4362								     const struct sna_opacity_box *b,
4363								     int nbox, float *v)
4364{
4365	do {
4366		v[0] = b->box.x2;
4367		v[6] = v[3] = b->box.x1;
4368		v[4] = v[1] = b->box.y2;
4369		v[7] = b->box.y1;
4370		v[8] = v[5] = v[2] = b->alpha;
4371
4372		v += 9;
4373		b++;
4374	} while (--nbox);
4375}
4376
4377sse2 fastcall static void
4378gen3_emit_composite_spans_primitive_identity_source__sse2(struct sna *sna,
4379							  const struct sna_composite_spans_op *op,
4380							  const BoxRec *box,
4381							  float opacity)
4382{
4383	float *v = sna->render.vertices + sna->render.vertex_used;
4384	sna->render.vertex_used += 15;
4385	assert(sna->render.vertex_used <= sna->render.vertex_size);
4386
4387	v[0] = op->base.dst.x + box->x2;
4388	v[1] = op->base.dst.y + box->y2;
4389	v[2] = (op->base.src.offset[0] + box->x2) * op->base.src.scale[0];
4390	v[3] = (op->base.src.offset[1] + box->y2) * op->base.src.scale[1];
4391	v[4] = opacity;
4392
4393	v[5] = op->base.dst.x + box->x1;
4394	v[6] = v[1];
4395	v[7] = (op->base.src.offset[0] + box->x1) * op->base.src.scale[0];
4396	v[8] = v[3];
4397	v[9] = opacity;
4398
4399	v[10] = v[5];
4400	v[11] = op->base.dst.y + box->y1;
4401	v[12] = v[7];
4402	v[13] = (op->base.src.offset[1] + box->y1) * op->base.src.scale[1];
4403	v[14] = opacity;
4404}
4405
4406sse2 fastcall static void
4407gen3_emit_composite_spans_primitive_identity_source__sse2__boxes(const struct sna_composite_spans_op *op,
4408								 const struct sna_opacity_box *b,
4409								 int nbox,
4410								 float *v)
4411{
4412	do {
4413		v[0] = op->base.dst.x + b->box.x2;
4414		v[1] = op->base.dst.y + b->box.y2;
4415		v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0];
4416		v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1];
4417		v[4] = b->alpha;
4418
4419		v[5] = op->base.dst.x + b->box.x1;
4420		v[6] = v[1];
4421		v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0];
4422		v[8] = v[3];
4423		v[9] = b->alpha;
4424
4425		v[10] = v[5];
4426		v[11] = op->base.dst.y + b->box.y1;
4427		v[12] = v[7];
4428		v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1];
4429		v[14] = b->alpha;
4430
4431		v += 15;
4432		b++;
4433	} while (--nbox);
4434}
4435sse2 fastcall static void
4436gen3_emit_composite_spans_primitive_affine_source__sse2(struct sna *sna,
4437							const struct sna_composite_spans_op *op,
4438							const BoxRec *box,
4439							float opacity)
4440{
4441	PictTransform *transform = op->base.src.transform;
4442	float *v;
4443
4444	v = sna->render.vertices + sna->render.vertex_used;
4445	sna->render.vertex_used += 15;
4446	assert(sna->render.vertex_used <= sna->render.vertex_size);
4447
4448	v[0]  = op->base.dst.x + box->x2;
4449	v[6]  = v[1] = op->base.dst.y + box->y2;
4450	v[10] = v[5] = op->base.dst.x + box->x1;
4451	v[11] = op->base.dst.y + box->y1;
4452	v[14] = v[9] = v[4]  = opacity;
4453
4454	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2,
4455				    (int)op->base.src.offset[1] + box->y2,
4456				    transform, op->base.src.scale,
4457				    &v[2], &v[3]);
4458
4459	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
4460				    (int)op->base.src.offset[1] + box->y2,
4461				    transform, op->base.src.scale,
4462				    &v[7], &v[8]);
4463
4464	_sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
4465				    (int)op->base.src.offset[1] + box->y1,
4466				    transform, op->base.src.scale,
4467				    &v[12], &v[13]);
4468}
4469
4470sse2 fastcall static void
4471gen3_emit_composite_spans_primitive_affine_source__sse2__boxes(const struct sna_composite_spans_op *op,
4472							       const struct sna_opacity_box *b,
4473							       int nbox,
4474							       float *v)
4475{
4476	PictTransform *transform = op->base.src.transform;
4477
4478	do {
4479		v[0]  = op->base.dst.x + b->box.x2;
4480		v[6]  = v[1] = op->base.dst.y + b->box.y2;
4481		v[10] = v[5] = op->base.dst.x + b->box.x1;
4482		v[11] = op->base.dst.y + b->box.y1;
4483		v[14] = v[9] = v[4]  = b->alpha;
4484
4485		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2,
4486					    (int)op->base.src.offset[1] + b->box.y2,
4487					    transform, op->base.src.scale,
4488					    &v[2], &v[3]);
4489
4490		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
4491					    (int)op->base.src.offset[1] + b->box.y2,
4492					    transform, op->base.src.scale,
4493					    &v[7], &v[8]);
4494
4495		_sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
4496					    (int)op->base.src.offset[1] + b->box.y1,
4497					    transform, op->base.src.scale,
4498					    &v[12], &v[13]);
4499		v += 15;
4500		b++;
4501	} while (--nbox);
4502}
4503
4504sse2 fastcall static void
4505gen3_emit_composite_spans_primitive_identity_gradient__sse2(struct sna *sna,
4506							    const struct sna_composite_spans_op *op,
4507							    const BoxRec *box,
4508							    float opacity)
4509{
4510	float *v = sna->render.vertices + sna->render.vertex_used;
4511	sna->render.vertex_used += 15;
4512	assert(sna->render.vertex_used <= sna->render.vertex_size);
4513
4514	v[0] = op->base.dst.x + box->x2;
4515	v[1] = op->base.dst.y + box->y2;
4516	v[2] = op->base.src.offset[0] + box->x2;
4517	v[3] = op->base.src.offset[1] + box->y2;
4518	v[4] = opacity;
4519
4520	v[5] = op->base.dst.x + box->x1;
4521	v[6] = v[1];
4522	v[7] = op->base.src.offset[0] + box->x1;
4523	v[8] = v[3];
4524	v[9] = opacity;
4525
4526	v[10] = v[5];
4527	v[11] = op->base.dst.y + box->y1;
4528	v[12] = v[7];
4529	v[13] = op->base.src.offset[1] + box->y1;
4530	v[14] = opacity;
4531}
4532
4533sse2 fastcall static void
4534gen3_emit_composite_spans_primitive_identity_gradient__sse2__boxes(const struct sna_composite_spans_op *op,
4535								   const struct sna_opacity_box *b,
4536								   int nbox,
4537								   float *v)
4538{
4539	do {
4540		v[0] = op->base.dst.x + b->box.x2;
4541		v[1] = op->base.dst.y + b->box.y2;
4542		v[2] = op->base.src.offset[0] + b->box.x2;
4543		v[3] = op->base.src.offset[1] + b->box.y2;
4544		v[4] = b->alpha;
4545
4546		v[5] = op->base.dst.x + b->box.x1;
4547		v[6] = v[1];
4548		v[7] = op->base.src.offset[0] + b->box.x1;
4549		v[8] = v[3];
4550		v[9] = b->alpha;
4551
4552		v[10] = v[5];
4553		v[11] = op->base.dst.y + b->box.y1;
4554		v[12] = v[7];
4555		v[13] = op->base.src.offset[1] + b->box.y1;
4556		v[14] = b->alpha;
4557
4558		v += 15;
4559		b++;
4560	} while (--nbox);
4561}
4562
4563sse2 fastcall static void
4564gen3_emit_composite_spans_primitive_affine_gradient__sse2(struct sna *sna,
4565							  const struct sna_composite_spans_op *op,
4566							  const BoxRec *box,
4567							  float opacity)
4568{
4569	PictTransform *transform = op->base.src.transform;
4570	float *v = sna->render.vertices + sna->render.vertex_used;
4571	sna->render.vertex_used += 15;
4572	assert(sna->render.vertex_used <= sna->render.vertex_size);
4573
4574	v[0] = op->base.dst.x + box->x2;
4575	v[1] = op->base.dst.y + box->y2;
4576	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
4577				    op->base.src.offset[1] + box->y2,
4578				    transform, op->base.src.scale,
4579				    &v[2], &v[3]);
4580	v[4] = opacity;
4581
4582	v[5] = op->base.dst.x + box->x1;
4583	v[6] = v[1];
4584	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
4585				    op->base.src.offset[1] + box->y2,
4586				    transform, op->base.src.scale,
4587				    &v[7], &v[8]);
4588	v[9] = opacity;
4589
4590	v[10] = v[5];
4591	v[11] = op->base.dst.y + box->y1;
4592	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
4593				    op->base.src.offset[1] + box->y1,
4594				    transform, op->base.src.scale,
4595				    &v[12], &v[13]);
4596	v[14] = opacity;
4597}
4598
4599sse2 fastcall static void
4600gen3_emit_composite_spans_primitive_affine_gradient__sse2__boxes(const struct sna_composite_spans_op *op,
4601								 const struct sna_opacity_box *b,
4602								 int nbox,
4603								 float *v)
4604{
4605	PictTransform *transform = op->base.src.transform;
4606
4607	do {
4608		v[0] = op->base.dst.x + b->box.x2;
4609		v[1] = op->base.dst.y + b->box.y2;
4610		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
4611					    op->base.src.offset[1] + b->box.y2,
4612					    transform, op->base.src.scale,
4613					    &v[2], &v[3]);
4614		v[4] = b->alpha;
4615
4616		v[5] = op->base.dst.x + b->box.x1;
4617		v[6] = v[1];
4618		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
4619					    op->base.src.offset[1] + b->box.y2,
4620					    transform, op->base.src.scale,
4621					    &v[7], &v[8]);
4622		v[9] = b->alpha;
4623
4624		v[10] = v[5];
4625		v[11] = op->base.dst.y + b->box.y1;
4626		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
4627					    op->base.src.offset[1] + b->box.y1,
4628					    transform, op->base.src.scale,
4629					    &v[12], &v[13]);
4630		v[14] = b->alpha;
4631		v += 15;
4632		b++;
4633	} while (--nbox);
4634}
4635#endif
4636
4637fastcall static void
4638gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna,
4639						    const struct sna_composite_spans_op *op,
4640						    const BoxRec *box,
4641						    float opacity)
4642{
4643	PictTransform *transform = op->base.src.transform;
4644	float *v = sna->render.vertices + sna->render.vertex_used;
4645	sna->render.vertex_used += 15;
4646	assert(sna->render.vertex_used <= sna->render.vertex_size);
4647
4648	v[0] = op->base.dst.x + box->x2;
4649	v[1] = op->base.dst.y + box->y2;
4650	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
4651				    op->base.src.offset[1] + box->y2,
4652				    transform, op->base.src.scale,
4653				    &v[2], &v[3]);
4654	v[4] = opacity;
4655
4656	v[5] = op->base.dst.x + box->x1;
4657	v[6] = v[1];
4658	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
4659				    op->base.src.offset[1] + box->y2,
4660				    transform, op->base.src.scale,
4661				    &v[7], &v[8]);
4662	v[9] = opacity;
4663
4664	v[10] = v[5];
4665	v[11] = op->base.dst.y + box->y1;
4666	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
4667				    op->base.src.offset[1] + box->y1,
4668				    transform, op->base.src.scale,
4669				    &v[12], &v[13]);
4670	v[14] = opacity;
4671}
4672
4673fastcall static void
4674gen3_emit_composite_spans_primitive_affine_gradient__boxes(const struct sna_composite_spans_op *op,
4675							   const struct sna_opacity_box *b,
4676							   int nbox,
4677							   float *v)
4678{
4679	PictTransform *transform = op->base.src.transform;
4680
4681	do {
4682		v[0] = op->base.dst.x + b->box.x2;
4683		v[1] = op->base.dst.y + b->box.y2;
4684		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
4685					    op->base.src.offset[1] + b->box.y2,
4686					    transform, op->base.src.scale,
4687					    &v[2], &v[3]);
4688		v[4] = b->alpha;
4689
4690		v[5] = op->base.dst.x + b->box.x1;
4691		v[6] = v[1];
4692		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
4693					    op->base.src.offset[1] + b->box.y2,
4694					    transform, op->base.src.scale,
4695					    &v[7], &v[8]);
4696		v[9] = b->alpha;
4697
4698		v[10] = v[5];
4699		v[11] = op->base.dst.y + b->box.y1;
4700		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
4701					    op->base.src.offset[1] + b->box.y1,
4702					    transform, op->base.src.scale,
4703					    &v[12], &v[13]);
4704		v[14] = b->alpha;
4705		v += 15;
4706		b++;
4707	} while (--nbox);
4708}
4709
fastcall static void
gen3_emit_composite_spans_primitive(struct sna *sna,
				    const struct sna_composite_spans_op *op,
				    const BoxRec *box,
				    float opacity)
{
	/* Generic fallback emitter: write the rectangle as three
	 * vertices -- (x2,y2), (x1,y2), (x1,y1) -- letting
	 * gen3_emit_composite_spans_vertex produce whatever per-vertex
	 * layout the selected shaders require. */
	gen3_emit_composite_spans_vertex(sna, op,
					 box->x2, box->y2,
					 opacity);
	gen3_emit_composite_spans_vertex(sna, op,
					 box->x1, box->y2,
					 opacity);
	gen3_emit_composite_spans_vertex(sna, op,
					 box->x1, box->y1,
					 opacity);
}
4726
4727fastcall static void
4728gen3_render_composite_spans_constant_box(struct sna *sna,
4729					 const struct sna_composite_spans_op *op,
4730					 const BoxRec *box, float opacity)
4731{
4732	float *v;
4733	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
4734	     __FUNCTION__,
4735	     op->base.src.offset[0], op->base.src.offset[1],
4736	     opacity,
4737	     op->base.dst.x, op->base.dst.y,
4738	     box->x1, box->y1,
4739	     box->x2 - box->x1,
4740	     box->y2 - box->y1));
4741
4742	gen3_get_rectangles(sna, &op->base, 1);
4743
4744	v = sna->render.vertices + sna->render.vertex_used;
4745	sna->render.vertex_used += 9;
4746	assert(sna->render.vertex_used <= sna->render.vertex_size);
4747
4748	v[0] = box->x2;
4749	v[6] = v[3] = box->x1;
4750	v[4] = v[1] = box->y2;
4751	v[7] = box->y1;
4752	v[8] = v[5] = v[2] = opacity;
4753}
4754
fastcall static void
gen3_render_composite_spans_constant_thread_boxes(struct sna *sna,
						  const struct sna_composite_spans_op *op,
						  const struct sna_opacity_box *box,
						  int nbox)
{
	/* Thread-safe batched emission of constant-shader boxes:
	 * reserve vertex space under the render vertex lock, then fill
	 * the reserved floats (9 per box: x, y, alpha per vertex)
	 * outside the lock so other threads can reserve concurrently. */
	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     op->base.dst.x, op->base.dst.y));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		/* May emit fewer than nbox if the batch is nearly full. */
		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * 9;
		assert(sna->render.vertex_used <= sna->render.vertex_size);

		/* Hold a reference on the vertex buffer while writing
		 * without the lock held. */
		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		do {
			v[0] = box->box.x2;
			v[6] = v[3] = box->box.x1;
			v[4] = v[1] = box->box.y2;
			v[7] = box->box.y1;
			v[8] = v[5] = v[2] = box->alpha;
			v += 9;
			box++;
		} while (--nbox_this_time);

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
4797
fastcall static void
gen3_render_composite_spans_box(struct sna *sna,
				const struct sna_composite_spans_op *op,
				const BoxRec *box, float opacity)
{
	/* Emit one opacity-weighted box: reserve space for a single
	 * rectangle in the batch, then let the shader-specific
	 * prim_emit hook write its vertices. */
	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
	     __FUNCTION__,
	     op->base.src.offset[0], op->base.src.offset[1],
	     opacity,
	     op->base.dst.x, op->base.dst.y,
	     box->x1, box->y1,
	     box->x2 - box->x1,
	     box->y2 - box->y1));

	gen3_get_rectangles(sna, &op->base, 1);
	op->prim_emit(sna, op, box, opacity);
}
4815
4816static void
4817gen3_render_composite_spans_boxes(struct sna *sna,
4818				  const struct sna_composite_spans_op *op,
4819				  const BoxRec *box, int nbox,
4820				  float opacity)
4821{
4822	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
4823	     __FUNCTION__, nbox,
4824	     op->base.src.offset[0], op->base.src.offset[1],
4825	     opacity,
4826	     op->base.dst.x, op->base.dst.y));
4827
4828	do {
4829		int nbox_this_time;
4830
4831		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
4832		nbox -= nbox_this_time;
4833
4834		do {
4835			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
4836			     box->x1, box->y1,
4837			     box->x2 - box->x1,
4838			     box->y2 - box->y1));
4839
4840			op->prim_emit(sna, op, box++, opacity);
4841		} while (--nbox_this_time);
4842	} while (nbox);
4843}
4844
fastcall static void
gen3_render_composite_spans_boxes__thread(struct sna *sna,
					  const struct sna_composite_spans_op *op,
					  const struct sna_opacity_box *box,
					  int nbox)
{
	/* Thread-safe batched emission through the shader-specific
	 * emit_boxes hook: reserve vertex space under the render
	 * vertex lock, then let emit_boxes fill the reserved floats
	 * outside the lock. */
	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     op->base.dst.x, op->base.dst.y));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		/* May emit fewer than nbox if the batch is nearly full. */
		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
		assert(sna->render.vertex_used <= sna->render.vertex_size);

		/* Hold a reference on the vertex buffer while writing
		 * without the lock held. */
		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
4880
fastcall static void
gen3_render_composite_spans_done(struct sna *sna,
				 const struct sna_composite_spans_op *op)
{
	/* Finish the spans operation: flush any pending vertices
	 * before releasing the source bo and undoing any destination
	 * redirection. */
	if (sna->render.vertex_offset)
		gen3_vertex_flush(sna);

	DBG(("%s()\n", __FUNCTION__));

	if (op->base.src.bo)
		kgem_bo_destroy(&sna->kgem, op->base.src.bo);

	sna_render_composite_redirect_done(sna, &op->base);
}
4895
static bool
gen3_check_composite_spans(struct sna *sna,
			   uint8_t op, PicturePtr src, PicturePtr dst,
			   int16_t width, int16_t height, unsigned flags)
{
	/* Decide whether the gen3 span compositor can handle this
	 * operation at all; false means fall back. */

	/* No blend-state translation for this Render operator. */
	if (op >= ARRAY_SIZE(gen3_blend_op))
		return false;

	if (gen3_composite_fallback(sna, op, src, NULL, dst))
		return false;

	/* An oversized (tiled) operation is only worthwhile when the
	 * destination is already resident on the GPU. */
	if (need_tiling(sna, width, height) &&
	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
		DBG(("%s: fallback, tiled operation not on GPU\n",
		     __FUNCTION__));
		return false;
	}

	return true;
}
4916
static bool
gen3_render_composite_spans(struct sna *sna,
			    uint8_t op,
			    PicturePtr src,
			    PicturePtr dst,
			    int16_t src_x,  int16_t src_y,
			    int16_t dst_x,  int16_t dst_y,
			    int16_t width,  int16_t height,
			    unsigned flags,
			    struct sna_composite_spans_op *tmp)
{
	/* Prepare a spans-composite operation on gen3 hardware:
	 * set up the render target and source channel, then select the
	 * fastest vertex-emission routines (prim_emit/emit_boxes) for
	 * the source shader type, with SSE2 variants where available.
	 * Returns false to request a software/tiling fallback. */
	bool no_offset;

	DBG(("%s(src=(%d, %d), dst=(%d, %d), size=(%d, %d))\n", __FUNCTION__,
	     src_x, src_y, dst_x, dst_y, width, height));

	assert(gen3_check_composite_spans(sna, op, src, dst, width, height, flags));

	/* Too large for the 3D pipeline in one pass: split into tiles. */
	if (need_tiling(sna, width, height)) {
		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
		     __FUNCTION__, width, height));
		return sna_tiling_composite_spans(op, src, dst,
						  src_x, src_y, dst_x, dst_y,
						  width, height, flags, tmp);
	}

	if (!gen3_composite_set_target(sna, &tmp->base, dst,
				       dst_x, dst_y, width, height,
				       true)) {
		DBG(("%s: unable to set render target\n",
		     __FUNCTION__));
		return false;
	}

	tmp->base.op = op;
	tmp->base.rb_reversed = gen3_dst_rb_reversed(tmp->base.dst.format);
	tmp->base.src.u.gen3.type = SHADER_TEXTURE;
	tmp->base.src.is_affine = true;
	DBG(("%s: preparing source\n", __FUNCTION__));
	/* -1: error; 0: source reduces to clear; 1: needs conversion. */
	switch (gen3_composite_picture(sna, src, &tmp->base, &tmp->base.src,
				       src_x, src_y,
				       width, height,
				       dst_x, dst_y,
				       dst->polyMode == PolyModePrecise)) {
	case -1:
		goto cleanup_dst;
	case 0:
		tmp->base.src.u.gen3.type = SHADER_ZERO;
		break;
	case 1:
		gen3_composite_channel_convert(&tmp->base.src);
		break;
	}
	DBG(("%s: source type=%d\n", __FUNCTION__, tmp->base.src.u.gen3.type));

	/* The per-span opacity is fed through the mask channel. */
	if (tmp->base.src.u.gen3.type != SHADER_ZERO)
		tmp->base.mask.u.gen3.type = SHADER_OPACITY;

	no_offset = tmp->base.dst.x == 0 && tmp->base.dst.y == 0;
	tmp->box   = gen3_render_composite_spans_box;
	tmp->boxes = gen3_render_composite_spans_boxes;
	tmp->thread_boxes = gen3_render_composite_spans_boxes__thread;
	tmp->done  = gen3_render_composite_spans_done;
	/* Generic emitter by default; specialised below when possible. */
	tmp->prim_emit = gen3_emit_composite_spans_primitive;
	switch (tmp->base.src.u.gen3.type) {
	case SHADER_NONE:
		assert(0);
		/* fallthrough */
	case SHADER_ZERO:
		if (no_offset) {
			tmp->prim_emit = gen3_emit_composite_spans_primitive_zero_no_offset;
			tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero_no_offset__boxes;
		} else {
			tmp->prim_emit = gen3_emit_composite_spans_primitive_zero;
			tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero__boxes;
		}
		break;
	case SHADER_BLACK:
	case SHADER_WHITE:
	case SHADER_CONSTANT:
		if (no_offset) {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->box = gen3_render_composite_spans_constant_box__sse2;
				tmp->thread_boxes = gen3_render_composite_spans_constant_thread__sse2__boxes;
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant__sse2__no_offset;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__sse2__no_offset__boxes;
			} else
#endif
			{
				tmp->box = gen3_render_composite_spans_constant_box;
				tmp->thread_boxes = gen3_render_composite_spans_constant_thread_boxes;
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant_no_offset;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant_no_offset__boxes;
			}
		} else {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_constant;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__boxes;
			}
		}
		break;
	case SHADER_LINEAR:
	case SHADER_RADIAL:
		if (tmp->base.src.transform == NULL) {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__boxes;
			}
		} else if (tmp->base.src.is_affine) {
			/* Fold the homogeneous w into a uniform scale. */
			tmp->base.src.scale[1] = tmp->base.src.scale[0] = 1. / tmp->base.src.transform->matrix[2][2];
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__boxes;
			}
		}
		break;
	case SHADER_TEXTURE:
		if (tmp->base.src.transform == NULL) {
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__boxes;
			}
		} else if (tmp->base.src.is_affine) {
			/* Fold the homogeneous w into the texcoord scale. */
			tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
			tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
#if defined(sse2) && !defined(__x86_64__)
			if (sna->cpu_features & SSE2) {
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source__sse2;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__sse2__boxes;
			} else
#endif
			{
				tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source;
				tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__boxes;
			}
		}
		break;
	}
	/* The threaded path requires a batched emitter. */
	if (tmp->emit_boxes == NULL)
		tmp->thread_boxes = NULL;

	tmp->base.mask.bo = NULL;

	/* Vertex layout: position (2) + texcoords (2 affine / 3
	 * projective, unless the shader is constant) + opacity (1). */
	tmp->base.floats_per_vertex = 2;
	if (!is_constant_ps(tmp->base.src.u.gen3.type))
		tmp->base.floats_per_vertex += tmp->base.src.is_affine ? 2 : 3;
	tmp->base.floats_per_vertex +=
		tmp->base.mask.u.gen3.type == SHADER_OPACITY;
	tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;

	/* Make sure both bos fit in this batch; flush once and retry. */
	if (!kgem_check_bo(&sna->kgem,
			   tmp->base.dst.bo, tmp->base.src.bo,
			   NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem,
				   tmp->base.dst.bo, tmp->base.src.bo,
				   NULL))
			goto cleanup_src;
	}

	gen3_align_vertex(sna, &tmp->base);
	gen3_emit_composite_state(sna, &tmp->base);
	return true;

cleanup_src:
	if (tmp->base.src.bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
cleanup_dst:
	if (tmp->base.redirect.real_bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
	return false;
}
5113
/*
 * Emit all of the 3D state required to draw a video frame: render target,
 * vertex format, blend state, pixel-shader constants, sampler state,
 * texture maps and the pixel shader program itself.
 *
 * Packed YUV (YUY2/UYVY) binds a single 422 map and uses the hardware
 * colourspace conversion; planar formats bind three Y8 maps (Y, U, V) and
 * perform the BT.601 YUV->RGB conversion manually in the shader.
 *
 * Redundant re-emission of constants/samplers is avoided by comparing the
 * freshly written dwords against the previously emitted copy and, when
 * identical, rewinding the batch pointer over them.
 *
 * NOTE(review): the bilinear argument is never referenced here -- both
 * paths hardwire FILTER_LINEAR; confirm whether that is intentional.
 */
static void
gen3_emit_video_state(struct sna *sna,
		      struct sna_video *video,
		      struct sna_video_frame *frame,
		      PixmapPtr pixmap,
		      struct kgem_bo *dst_bo,
		      int width, int height,
		      bool bilinear)
{
	struct gen3_render_state *state = &sna->render_state.gen3;
	uint32_t id, ms3, rewind;

	gen3_emit_target(sna, dst_bo, width, height,
			 sna_format_for_depth(pixmap->drawable.depth));

	/* XXX share with composite? Is it worth the effort? */
	/* Bit 31 of last_shader marks that a video shader (and hence this
	 * vertex/blend setup) has already been loaded; see the id values
	 * assigned below.
	 */
	if ((state->last_shader & (1<<31)) == 0) {
		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
			  I1_LOAD_S(1) | I1_LOAD_S(2) | I1_LOAD_S(6) |
			  2);
		/* S1: 4 floats per vertex -- x, y and one 2D texcoord */
		OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) | (4 << S1_VERTEX_PITCH_SHIFT));
		/* S2: only texture coordinate set 0 is present */
		OUT_BATCH(S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D) |
			  S2_TEXCOORD_FMT(1, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(2, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(3, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(4, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(5, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(6, TEXCOORDFMT_NOT_PRESENT) |
			  S2_TEXCOORD_FMT(7, TEXCOORDFMT_NOT_PRESENT));
		/* S6: opaque source-over (no real blending), colour writes on */
		OUT_BATCH((2 << S6_CBUF_SRC_BLEND_FACT_SHIFT) |
			  (1 << S6_CBUF_DST_BLEND_FACT_SHIFT) |
			  S6_COLOR_WRITE_ENABLE);

		state->last_blend = 0;
		state->floats_per_vertex = 4;
	}

	if (!is_planar_fourcc(frame->id)) {
		/* Packed 422 path: one sampler, hw colourspace conversion. */
		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | 4);
		OUT_BATCH(0x0000001);	/* constant 0 */
		/* constant 0: brightness/contrast */
		OUT_BATCH_F(video->brightness / 128.0);
		OUT_BATCH_F(video->contrast / 255.0);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(0.0);
		/* Drop the 6 dwords again if they match the last emission. */
		if (state->last_constants &&
		    memcmp(&sna->kgem.batch[state->last_constants],
			   &sna->kgem.batch[rewind],
			   6*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_constants = rewind;

		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_SAMPLER_STATE | 3);
		OUT_BATCH(0x00000001);
		OUT_BATCH(SS2_COLORSPACE_CONVERSION |
			  (FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (0 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		/* Likewise dedup the 5 sampler dwords. */
		if (state->last_sampler &&
		    memcmp(&sna->kgem.batch[state->last_sampler],
			   &sna->kgem.batch[rewind],
			   5*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_sampler = rewind;

		OUT_BATCH(_3DSTATE_MAP_STATE | 3);
		OUT_BATCH(0x00000001);	/* texture map #1 */
		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 0));

		ms3 = MAPSURF_422;
		switch (frame->id) {
		case FOURCC_YUY2:
			ms3 |= MT_422_YCRCB_NORMAL;
			break;
		case FOURCC_UYVY:
			ms3 |= MT_422_YCRCB_SWAPY;
			break;
		}
		ms3 |= (frame->height - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT);

		/* Shader id: bit31 = video, bit1 = packed variant,
		 * bit0 = whether the brightness add is appended. */
		id = 1<<31 | 1<<1 | !!video->brightness;
		if (state->last_shader != id) {
			state->last_shader = id;
			/* Reserve a dword for the program header; patched
			 * below once the program length is known. */
			id = sna->kgem.nbatch++;

			gen3_fs_dcl(FS_S0);
			gen3_fs_dcl(FS_T0);
			gen3_fs_texld(FS_OC, FS_S0, FS_T0);
			if (video->brightness != 0) {
				gen3_fs_add(FS_OC,
					    gen3_fs_operand_reg(FS_OC),
					    gen3_fs_operand(FS_C0, X, X, X, ZERO));
			}

			sna->kgem.batch[id] =
				_3DSTATE_PIXEL_SHADER_PROGRAM |
				(sna->kgem.nbatch - id - 2);
		}
	} else {
		/* For the planar formats, we set up three samplers --
		 * one for each plane, in a Y8 format.  Because I
		 * couldn't get the special PLANAR_TO_PACKED
		 * shader setup to work, I did the manual pixel shader:
		 *
		 * y' = y - .0625
		 * u' = u - .5
		 * v' = v - .5;
		 *
		 * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
		 * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
		 * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
		 *
		 * register assignment:
		 * r0 = (y',u',v',0)
		 * r1 = (y,y,y,y)
		 * r2 = (u,u,u,u)
		 * r3 = (v,v,v,v)
		 * OC = (r,g,b,1)
		 */
		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_PIXEL_SHADER_CONSTANTS | (22 - 2));
		OUT_BATCH(0x000001f);	/* constants 0-4 */
		/* constant 0: normalization offsets */
		OUT_BATCH_F(-0.0625);
		OUT_BATCH_F(-0.5);
		OUT_BATCH_F(-0.5);
		OUT_BATCH_F(0.0);
		/* constant 1: r coefficients */
		OUT_BATCH_F(1.1643);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(1.5958);
		OUT_BATCH_F(0.0);
		/* constant 2: g coefficients */
		OUT_BATCH_F(1.1643);
		OUT_BATCH_F(-0.39173);
		OUT_BATCH_F(-0.81290);
		OUT_BATCH_F(0.0);
		/* constant 3: b coefficients */
		OUT_BATCH_F(1.1643);
		OUT_BATCH_F(2.017);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(0.0);
		/* constant 4: brightness/contrast */
		OUT_BATCH_F(video->brightness / 128.0);
		OUT_BATCH_F(video->contrast / 255.0);
		OUT_BATCH_F(0.0);
		OUT_BATCH_F(0.0);
		/* Rewind over the 22 dwords if nothing changed. */
		if (state->last_constants &&
		    memcmp(&sna->kgem.batch[state->last_constants],
			   &sna->kgem.batch[rewind],
			   22*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_constants = rewind;

		rewind = sna->kgem.nbatch;
		OUT_BATCH(_3DSTATE_SAMPLER_STATE | 9);
		OUT_BATCH(0x00000007);
		/* sampler 0 */
		OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (0 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		/* sampler 1 */
		OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (1 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		/* sampler 2 */
		OUT_BATCH((FILTER_LINEAR << SS2_MAG_FILTER_SHIFT) |
			  (FILTER_LINEAR << SS2_MIN_FILTER_SHIFT));
		OUT_BATCH((TEXCOORDMODE_CLAMP_EDGE << SS3_TCX_ADDR_MODE_SHIFT) |
			  (TEXCOORDMODE_CLAMP_EDGE << SS3_TCY_ADDR_MODE_SHIFT) |
			  (2 << SS3_TEXTUREMAP_INDEX_SHIFT) |
			  SS3_NORMALIZED_COORDS);
		OUT_BATCH(0x00000000);
		/* Rewind over the 11 sampler dwords if nothing changed. */
		if (state->last_sampler &&
		    memcmp(&sna->kgem.batch[state->last_sampler],
			   &sna->kgem.batch[rewind],
			   11*sizeof(uint32_t)) == 0)
			sna->kgem.nbatch = rewind;
		else
			state->last_sampler = rewind;

		OUT_BATCH(_3DSTATE_MAP_STATE | 9);
		OUT_BATCH(0x00000007);

		/* Map 0: Y plane, full resolution. */
		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 0));

		ms3 = MAPSURF_8BIT | MT_8BIT_I8;
		ms3 |= (frame->height - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		/* check to see if Y has special pitch than normal
		 * double u/v pitch, e.g i915 XvMC hw requires at
		 * least 1K alignment, so Y pitch might
		 * be same as U/V's.*/
		if (frame->pitch[1])
			OUT_BATCH(((frame->pitch[1] / 4) - 1) << MS4_PITCH_SHIFT);
		else
			OUT_BATCH(((frame->pitch[0] * 2 / 4) - 1) << MS4_PITCH_SHIFT);

		/* Map 1: U plane, half resolution. */
		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 frame->UBufOffset));

		ms3 = MAPSURF_8BIT | MT_8BIT_I8;
		ms3 |= (frame->height / 2 - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width / 2 - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT);

		/* Map 2: V plane, half resolution. */
		OUT_BATCH(kgem_add_reloc(&sna->kgem, sna->kgem.nbatch,
					 frame->bo,
					 I915_GEM_DOMAIN_SAMPLER << 16,
					 frame->VBufOffset));

		ms3 = MAPSURF_8BIT | MT_8BIT_I8;
		ms3 |= (frame->height / 2 - 1) << MS3_HEIGHT_SHIFT;
		ms3 |= (frame->width / 2 - 1) << MS3_WIDTH_SHIFT;
		OUT_BATCH(ms3);
		OUT_BATCH(((frame->pitch[0] / 4) - 1) << MS4_PITCH_SHIFT);

		/* Shader id: bit31 = video, bit2 = planar variant,
		 * bit0 = whether the brightness add is appended. */
		id = 1<<31 | 2<<1 | !!video->brightness;
		if (state->last_shader != id) {
			state->last_shader = id;
			/* Reserve a dword for the program header; patched
			 * below once the program length is known. */
			id = sna->kgem.nbatch++;

			/* Declare samplers */
			gen3_fs_dcl(FS_S0);	/* Y */
			gen3_fs_dcl(FS_S1);	/* U */
			gen3_fs_dcl(FS_S2);	/* V */
			gen3_fs_dcl(FS_T0);	/* normalized coords */

			/* Load samplers to temporaries. */
			gen3_fs_texld(FS_R1, FS_S0, FS_T0);
			gen3_fs_texld(FS_R2, FS_S1, FS_T0);
			gen3_fs_texld(FS_R3, FS_S2, FS_T0);

			/* Move the sampled YUV data in R[123] to the first
			 * 3 channels of R0.
			 */
			gen3_fs_mov_masked(FS_R0, MASK_X,
					   gen3_fs_operand_reg(FS_R1));
			gen3_fs_mov_masked(FS_R0, MASK_Y,
					   gen3_fs_operand_reg(FS_R2));
			gen3_fs_mov_masked(FS_R0, MASK_Z,
					   gen3_fs_operand_reg(FS_R3));

			/* Normalize the YUV data */
			gen3_fs_add(FS_R0, gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C0));
			/* dot-product the YUV data in R0 by the vectors of
			 * coefficients for calculating R, G, and B, storing
			 * the results in the R, G, or B channels of the output
			 * color.  The OC results are implicitly clamped
			 * at the end of the program.
			 */
			gen3_fs_dp3(FS_OC, MASK_X,
				    gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C1));
			gen3_fs_dp3(FS_OC, MASK_Y,
				    gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C2));
			gen3_fs_dp3(FS_OC, MASK_Z,
				    gen3_fs_operand_reg(FS_R0),
				    gen3_fs_operand_reg(FS_C3));
			/* Set alpha of the output to 1.0, by wiring W to 1
			 * and not actually using the source.
			 */
			gen3_fs_mov_masked(FS_OC, MASK_W,
					   gen3_fs_operand_one());

			if (video->brightness != 0) {
				gen3_fs_add(FS_OC,
					    gen3_fs_operand_reg(FS_OC),
					    gen3_fs_operand(FS_C4, X, X, X, ZERO));
			}

			sna->kgem.batch[id] =
				_3DSTATE_PIXEL_SHADER_PROGRAM |
				(sna->kgem.nbatch - id - 2);
		}
	}
}
5423
5424static void
5425gen3_video_get_batch(struct sna *sna, struct kgem_bo *bo)
5426{
5427	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
5428
5429	if (!kgem_check_batch(&sna->kgem, 120) ||
5430	    !kgem_check_reloc(&sna->kgem, 4) ||
5431	    !kgem_check_exec(&sna->kgem, 2)) {
5432		_kgem_submit(&sna->kgem);
5433		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
5434	}
5435
5436	if (sna->render_state.gen3.need_invariant)
5437		gen3_emit_invariant(sna);
5438}
5439
/* Clamp the number of inline rectangles to whatever still fits in the
 * batch: each rectangle is 3 vertices, with one dword reserved for the
 * primitive command itself.
 */
static int
gen3_get_inline_rectangles(struct sna *sna, int want, int floats_per_vertex)
{
	const int per_rect = 3 * floats_per_vertex;
	const int avail = batch_space(sna) - 1;

	if (per_rect * want > avail)
		want = avail / per_rect;

	return want;
}
5451
/*
 * Draw a video frame into the destination pixmap's GPU bo using the 3D
 * pipeline, one textured RECTLIST rectangle per clip box.
 *
 * If the destination is too large for the 3D pipe (or has an unusable
 * pitch), the frame is first rendered into a temporary X-tiled bo sized to
 * the dirty extents and then blitted into place.
 *
 * Returns false only when a required temporary bo cannot be allocated (or
 * the region is too large even for that); true on success.
 */
static bool
gen3_render_video(struct sna *sna,
		  struct sna_video *video,
		  struct sna_video_frame *frame,
		  RegionPtr dstRegion,
		  PixmapPtr pixmap)
{
	struct sna_pixmap *priv = sna_pixmap(pixmap);
	const BoxRec *pbox = region_rects(dstRegion);
	int nbox = region_num_rects(dstRegion);
	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
	int src_width = frame->src.x2 - frame->src.x1;
	int src_height = frame->src.y2 - frame->src.y1;
	float src_offset_x, src_offset_y;
	float src_scale_x, src_scale_y;
	int pix_xoff, pix_yoff;
	struct kgem_bo *dst_bo;
	bool bilinear;
	int copy = 0;

	DBG(("%s: src:%dx%d (frame:%dx%d) -> dst:%dx%d\n", __FUNCTION__,
	     src_width, src_height, frame->width, frame->height, dst_width, dst_height));

	assert(priv->gpu_bo);
	dst_bo = priv->gpu_bo;

	/* Any scaling at all requires bilinear filtering. */
	bilinear = src_width != dst_width || src_height != dst_height;

	/* Map destination pixel coordinates to normalised source texture
	 * coordinates: u = x * scale + offset (and likewise for v).
	 */
	src_scale_x = (float)src_width / dst_width / frame->width;
	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;

	src_scale_y = (float)src_height / dst_height / frame->height;
	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;
	DBG(("%s: src offset (%f, %f), scale (%f, %f)\n",
	     __FUNCTION__, src_offset_x, src_offset_y, src_scale_x, src_scale_y));

	/* Too large (or bad pitch) for the 3D pipe: render via a temporary
	 * bo covering just the dirty extents and blit back afterwards.
	 */
	if (too_large(pixmap->drawable.width, pixmap->drawable.height) ||
	    !gen3_check_pitch_3d(dst_bo)) {
		int bpp = pixmap->drawable.bitsPerPixel;

		if (too_large(dst_width, dst_height))
			return false;

		dst_bo = kgem_create_2d(&sna->kgem,
					dst_width, dst_height, bpp,
					kgem_choose_tiling(&sna->kgem,
							   I915_TILING_X,
							   dst_width, dst_height, bpp),
					0);
		if (!dst_bo)
			return false;

		/* Shift drawing so the extents' origin lands at (0,0). */
		pix_xoff = -dstRegion->extents.x1;
		pix_yoff = -dstRegion->extents.y1;
		copy = 1;
	} else {
		pix_xoff = pix_yoff = 0;
		dst_width  = pixmap->drawable.width;
		dst_height = pixmap->drawable.height;
	}

	gen3_video_get_batch(sna, dst_bo);
	gen3_emit_video_state(sna, video, frame, pixmap,
			      dst_bo, dst_width, dst_height, bilinear);
	do {
		/* 4 floats per vertex -> see gen3_emit_video_state. */
		int nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4);
		if (nbox_this_time == 0) {
			/* Batch full: flush and re-emit state before retrying. */
			gen3_video_get_batch(sna, dst_bo);
			gen3_emit_video_state(sna, video, frame, pixmap,
					      dst_bo, dst_width, dst_height, bilinear);
			nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4);
			assert(nbox_this_time);
		}
		nbox -= nbox_this_time;

		/* 12 dwords per rectangle: 3 vertices x (x, y, u, v). */
		OUT_BATCH(PRIM3D_RECTLIST | (12 * nbox_this_time - 1));
		do {
			int box_x1 = pbox->x1;
			int box_y1 = pbox->y1;
			int box_x2 = pbox->x2;
			int box_y2 = pbox->y2;

			pbox++;

			DBG(("%s: dst (%d, %d), (%d, %d) + (%d, %d); src (%f, %f), (%f, %f)\n",
			     __FUNCTION__, box_x1, box_y1, box_x2, box_y2, pix_xoff, pix_yoff,
			     box_x1 * src_scale_x + src_offset_x,
			     box_y1 * src_scale_y + src_offset_y,
			     box_x2 * src_scale_x + src_offset_x,
			     box_y2 * src_scale_y + src_offset_y));

			/* bottom right */
			OUT_BATCH_F(box_x2 + pix_xoff);
			OUT_BATCH_F(box_y2 + pix_yoff);
			OUT_BATCH_F(box_x2 * src_scale_x + src_offset_x);
			OUT_BATCH_F(box_y2 * src_scale_y + src_offset_y);

			/* bottom left */
			OUT_BATCH_F(box_x1 + pix_xoff);
			OUT_BATCH_F(box_y2 + pix_yoff);
			OUT_BATCH_F(box_x1 * src_scale_x + src_offset_x);
			OUT_BATCH_F(box_y2 * src_scale_y + src_offset_y);

			/* top left */
			OUT_BATCH_F(box_x1 + pix_xoff);
			OUT_BATCH_F(box_y1 + pix_yoff);
			OUT_BATCH_F(box_x1 * src_scale_x + src_offset_x);
			OUT_BATCH_F(box_y1 * src_scale_y + src_offset_y);
		} while (--nbox_this_time);
	} while (nbox);

	if (copy) {
		/* Blit the temporary back to the real destination. */
		sna_blt_copy_boxes(sna, GXcopy,
				   dst_bo, -dstRegion->extents.x1, -dstRegion->extents.y1,
				   priv->gpu_bo, 0, 0,
				   pixmap->drawable.bitsPerPixel,
				   region_rects(dstRegion),
				   region_num_rects(dstRegion));

		kgem_bo_destroy(&sna->kgem, dst_bo);
	}

	if (!DAMAGE_IS_ALL(priv->gpu_damage))
		sna_damage_add(&priv->gpu_damage, dstRegion);

	return true;
}
5580
5581static void
5582gen3_render_copy_setup_source(struct sna_composite_channel *channel,
5583			      const DrawableRec *draw,
5584			      struct kgem_bo *bo)
5585{
5586	int i;
5587
5588	channel->u.gen3.type = SHADER_TEXTURE;
5589	channel->filter = gen3_filter(PictFilterNearest);
5590	channel->repeat = gen3_texture_repeat(RepeatNone);
5591	channel->width  = draw->width;
5592	channel->height = draw->height;
5593	channel->scale[0] = 1.f/draw->width;
5594	channel->scale[1] = 1.f/draw->height;
5595	channel->offset[0] = 0;
5596	channel->offset[1] = 0;
5597
5598	channel->pict_format = sna_format_for_depth(draw->depth);
5599	if (!gen3_composite_channel_set_format(channel, channel->pict_format)) {
5600		for (i = 0; i < ARRAY_SIZE(gen3_tex_formats); i++) {
5601			if (gen3_tex_formats[i].xfmt == channel->pict_format) {
5602				channel->card_format = gen3_tex_formats[i].card_fmt;
5603				channel->rb_reversed = gen3_tex_formats[i].rb_reversed;
5604				channel->alpha_fixup = true;
5605				break;
5606			}
5607		}
5608	}
5609	assert(channel->card_format);
5610
5611	channel->bo = bo;
5612	channel->is_affine = 1;
5613}
5614
/*
 * Copy a list of boxes from src_bo to dst_bo.
 *
 * Strategy, in order of preference:
 *   1. BLT engine, when the depths match and the blit succeeds;
 *   2. 3D pipeline (only for GXcopy/GXclear on 3D-addressable sources),
 *      redirecting through a temporary render target when the destination
 *      is too large;
 *   3. BLT fallback (with software tiling for oversized destinations).
 *
 * Returns false only when no path can perform the copy.
 */
static bool
gen3_render_copy_boxes(struct sna *sna, uint8_t alu,
		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
		       const BoxRec *box, int n, unsigned flags)
{
	struct sna_composite_op tmp;

#if NO_COPY_BOXES
	if (!sna_blt_compare_depth(src, dst))
		return false;

	return sna_blt_copy_boxes(sna, alu,
				  src_bo, src_dx, src_dy,
				  dst_bo, dst_dx, dst_dy,
				  dst->bitsPerPixel,
				  box, n);
#endif

	DBG(("%s (%d, %d)->(%d, %d) x %d\n",
	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n));

	/* Prefer the BLT engine whenever it can do the job. */
	if (sna_blt_compare_depth(src, dst) &&
	    sna_blt_copy_boxes(sna, alu,
			       src_bo, src_dx, src_dy,
			       dst_bo, dst_dx, dst_dy,
			       dst->bitsPerPixel,
			       box, n))
		return true;

	if (!(alu == GXcopy || alu == GXclear) ||
	    src_bo == dst_bo || /* XXX handle overlap using 3D ? */
	    src_bo->pitch > MAX_3D_PITCH ||
	    too_large(src->width, src->height)) {
fallback_blt:
		if (!kgem_bo_can_blt(&sna->kgem, src_bo) ||
		    !kgem_bo_can_blt(&sna->kgem, dst_bo))
			return false;

		return sna_blt_copy_boxes_fallback(sna, alu,
						   src, src_bo, src_dx, src_dy,
						   dst, dst_bo, dst_dx, dst_dy,
						   box, n);
	}

	/* Make room for both bo in the batch, flushing once if needed. */
	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
			goto fallback_blt;
	}

	memset(&tmp, 0, sizeof(tmp));
	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;

	tmp.dst.pixmap = (PixmapPtr)dst;
	tmp.dst.width = dst->width;
	tmp.dst.height = dst->height;
	tmp.dst.format = sna_format_for_depth(dst->depth);
	tmp.dst.bo = dst_bo;
	tmp.dst.x = tmp.dst.y = 0;
	tmp.damage = NULL;

	sna_render_composite_redirect_init(&tmp);
	/* Destination beyond 3D limits: render through a temporary target
	 * covering the union of all boxes, then copy back on done.
	 */
	if (too_large(tmp.dst.width, tmp.dst.height) ||
	    dst_bo->pitch > MAX_3D_PITCH) {
		BoxRec extents = box[0];
		int i;

		for (i = 1; i < n; i++) {
			if (box[i].x1 < extents.x1)
				extents.x1 = box[i].x1;
			if (box[i].y1 < extents.y1)
				extents.y1 = box[i].y1;

			if (box[i].x2 > extents.x2)
				extents.x2 = box[i].x2;
			if (box[i].y2 > extents.y2)
				extents.y2 = box[i].y2;
		}
		if (!sna_render_composite_redirect(sna, &tmp,
						   extents.x1 + dst_dx,
						   extents.y1 + dst_dy,
						   extents.x2 - extents.x1,
						   extents.y2 - extents.y1,
						   n > 1))
			goto fallback_tiled;
	}

	gen3_render_copy_setup_source(&tmp.src, src, src_bo);

	tmp.floats_per_vertex = 4;
	tmp.floats_per_rect = 12;
	tmp.mask.bo = NULL;
	tmp.mask.u.gen3.type = SHADER_NONE;

	/* Fold any redirect offset into the per-box delta. */
	dst_dx += tmp.dst.x;
	dst_dy += tmp.dst.y;
	tmp.dst.x = tmp.dst.y = 0;

	gen3_align_vertex(sna, &tmp);
	gen3_emit_composite_state(sna, &tmp);

	do {
		int n_this_time;

		n_this_time = gen3_get_rectangles(sna, &tmp, n);
		n -= n_this_time;

		/* 3 vertices per box: dst position plus normalised texcoord. */
		do {
			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
			     box->x1 + src_dx, box->y1 + src_dy,
			     box->x1 + dst_dx, box->y1 + dst_dy,
			     box->x2 - box->x1, box->y2 - box->y1));
			OUT_VERTEX(box->x2 + dst_dx);
			OUT_VERTEX(box->y2 + dst_dy);
			OUT_VERTEX((box->x2 + src_dx) * tmp.src.scale[0]);
			OUT_VERTEX((box->y2 + src_dy) * tmp.src.scale[1]);

			OUT_VERTEX(box->x1 + dst_dx);
			OUT_VERTEX(box->y2 + dst_dy);
			OUT_VERTEX((box->x1 + src_dx) * tmp.src.scale[0]);
			OUT_VERTEX((box->y2 + src_dy) * tmp.src.scale[1]);

			OUT_VERTEX(box->x1 + dst_dx);
			OUT_VERTEX(box->y1 + dst_dy);
			OUT_VERTEX((box->x1 + src_dx) * tmp.src.scale[0]);
			OUT_VERTEX((box->y1 + src_dy) * tmp.src.scale[1]);

			box++;
		} while (--n_this_time);
	} while (n);

	gen3_vertex_flush(sna);
	sna_render_composite_redirect_done(sna, &tmp);
	return true;

fallback_tiled:
	return sna_tiling_copy_boxes(sna, alu,
				     src, src_bo, src_dx, src_dy,
				     dst, dst_bo, dst_dx, dst_dy,
				     box, n);
}
5757
5758static void
5759gen3_render_copy_blt(struct sna *sna,
5760		     const struct sna_copy_op *op,
5761		     int16_t sx, int16_t sy,
5762		     int16_t w, int16_t h,
5763		     int16_t dx, int16_t dy)
5764{
5765	gen3_get_rectangles(sna, &op->base, 1);
5766
5767	OUT_VERTEX(dx+w);
5768	OUT_VERTEX(dy+h);
5769	OUT_VERTEX((sx+w)*op->base.src.scale[0]);
5770	OUT_VERTEX((sy+h)*op->base.src.scale[1]);
5771
5772	OUT_VERTEX(dx);
5773	OUT_VERTEX(dy+h);
5774	OUT_VERTEX(sx*op->base.src.scale[0]);
5775	OUT_VERTEX((sy+h)*op->base.src.scale[1]);
5776
5777	OUT_VERTEX(dx);
5778	OUT_VERTEX(dy);
5779	OUT_VERTEX(sx*op->base.src.scale[0]);
5780	OUT_VERTEX(sy*op->base.src.scale[1]);
5781}
5782
5783static void
5784gen3_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
5785{
5786	if (sna->render.vertex_offset)
5787		gen3_vertex_flush(sna);
5788}
5789
5790static bool
5791gen3_render_copy(struct sna *sna, uint8_t alu,
5792		 PixmapPtr src, struct kgem_bo *src_bo,
5793		 PixmapPtr dst, struct kgem_bo *dst_bo,
5794		 struct sna_copy_op *tmp)
5795{
5796#if NO_COPY
5797	if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
5798		return false;
5799
5800	return sna_blt_copy(sna, alu,
5801			    src_bo, dst_bo,
5802			    dst->drawable.bitsPerPixel,
5803			    tmp);
5804#endif
5805
5806	/* Prefer to use the BLT */
5807	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
5808	    sna_blt_copy(sna, alu,
5809			 src_bo, dst_bo,
5810			 dst->drawable.bitsPerPixel,
5811			 tmp))
5812		return true;
5813
5814	/* Must use the BLT if we can't RENDER... */
5815	if (!(alu == GXcopy || alu == GXclear) ||
5816	    too_large(src->drawable.width, src->drawable.height) ||
5817	    too_large(dst->drawable.width, dst->drawable.height) ||
5818	    src_bo->pitch > MAX_3D_PITCH || dst_bo->pitch > MAX_3D_PITCH) {
5819fallback:
5820		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
5821			return false;
5822
5823		return sna_blt_copy(sna, alu, src_bo, dst_bo,
5824				    dst->drawable.bitsPerPixel,
5825				    tmp);
5826	}
5827
5828	tmp->base.op = alu == GXcopy ? PictOpSrc : PictOpClear;
5829
5830	tmp->base.dst.pixmap = dst;
5831	tmp->base.dst.width = dst->drawable.width;
5832	tmp->base.dst.height = dst->drawable.height;
5833	tmp->base.dst.format = sna_format_for_depth(dst->drawable.depth);
5834	tmp->base.dst.bo = dst_bo;
5835
5836	gen3_render_copy_setup_source(&tmp->base.src, &src->drawable, src_bo);
5837
5838	tmp->base.floats_per_vertex = 4;
5839	tmp->base.floats_per_rect = 12;
5840	tmp->base.mask.bo = NULL;
5841	tmp->base.mask.u.gen3.type = SHADER_NONE;
5842
5843	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
5844		kgem_submit(&sna->kgem);
5845		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
5846			goto fallback;
5847	}
5848
5849	tmp->blt  = gen3_render_copy_blt;
5850	tmp->done = gen3_render_copy_done;
5851
5852	gen3_align_vertex(sna, &tmp->base);
5853	gen3_emit_composite_state(sna, &tmp->base);
5854	return true;
5855}
5856
5857static bool
5858gen3_render_fill_boxes_try_blt(struct sna *sna,
5859			       CARD8 op, PictFormat format,
5860			       const xRenderColor *color,
5861			       const DrawableRec *dst, struct kgem_bo *dst_bo,
5862			       const BoxRec *box, int n)
5863{
5864	uint8_t alu;
5865	uint32_t pixel;
5866
5867	if (dst_bo->tiling == I915_TILING_Y) {
5868		DBG(("%s: y-tiling, can't blit\n", __FUNCTION__));
5869		assert(!too_large(dst->width, dst->height));
5870		return false;
5871	}
5872
5873	if (op > PictOpSrc)
5874		return false;
5875
5876	if (op == PictOpClear) {
5877		alu = GXclear;
5878		pixel = 0;
5879	} else if (!sna_get_pixel_from_rgba(&pixel,
5880					    color->red,
5881					    color->green,
5882					    color->blue,
5883					    color->alpha,
5884					    format))
5885		return false;
5886	else
5887		alu = GXcopy;
5888
5889	return sna_blt_fill_boxes(sna, alu,
5890				  dst_bo, dst->bitsPerPixel,
5891				  pixel, box, n);
5892}
5893
5894static inline bool prefer_fill_blt(struct sna *sna)
5895{
5896#if PREFER_BLT_FILL
5897	return true;
5898#else
5899	return sna->kgem.mode != KGEM_RENDER;
5900#endif
5901}
5902
/*
 * Fill a list of boxes with a solid colour under a Render operator.
 *
 * Falls back in stages: unsupported blend op -> fail; destination too big
 * or wrong format -> BLT then software tiling; otherwise prefer the BLT
 * and finally draw solid rectangles through the 3D pipeline.
 */
static bool
gen3_render_fill_boxes(struct sna *sna,
		       CARD8 op,
		       PictFormat format,
		       const xRenderColor *color,
		       const DrawableRec *dst, struct kgem_bo *dst_bo,
		       const BoxRec *box, int n)
{
	struct sna_composite_op tmp;
	uint32_t pixel;

	if (op >= ARRAY_SIZE(gen3_blend_op)) {
		DBG(("%s: fallback due to unhandled blend op: %d\n",
		     __FUNCTION__, op));
		return false;
	}

#if NO_FILL_BOXES
	return gen3_render_fill_boxes_try_blt(sna, op, format, color,
					      dst, dst_bo,
					      box, n);
#endif

	DBG(("%s (op=%d, format=%x, color=(%04x,%04x,%04x, %04x))\n",
	     __FUNCTION__, op, (int)format,
	     color->red, color->green, color->blue, color->alpha));

	if (too_large(dst->width, dst->height) ||
	    dst_bo->pitch > MAX_3D_PITCH ||
	    !gen3_check_dst_format(format)) {
		DBG(("%s: try blt, too large or incompatible destination\n",
		     __FUNCTION__));
		if (gen3_render_fill_boxes_try_blt(sna, op, format, color,
						   dst, dst_bo,
						   box, n))
			return true;

		if (!gen3_check_dst_format(format))
			return false;

		/* Too large but renderable: carve into 3D-sized tiles. */
		return sna_tiling_fill_boxes(sna, op, format, color,
					     dst, dst_bo, box, n);
	}

	if (prefer_fill_blt(sna) &&
	    gen3_render_fill_boxes_try_blt(sna, op, format, color,
					   dst, dst_bo,
					   box, n))
		return true;

	/* The solid shader takes an a8r8g8b8 colour regardless of the
	 * destination format.
	 */
	if (op == PictOpClear) {
		pixel = 0;
	} else {
		if (!sna_get_pixel_from_rgba(&pixel,
					     color->red,
					     color->green,
					     color->blue,
					     color->alpha,
					     PICT_a8r8g8b8)) {
			assert(0);
			return false;
		}
	}
	DBG(("%s: using shader for op=%d, format=%08x, pixel=%08x\n",
	     __FUNCTION__, op, (int)format, pixel));

	tmp.op = op;
	tmp.dst.pixmap = (PixmapPtr)dst;
	tmp.dst.width = dst->width;
	tmp.dst.height = dst->height;
	tmp.dst.format = format;
	tmp.dst.bo = dst_bo;
	tmp.damage = NULL;
	/* Position-only vertices: 2 floats each, 3 per rectangle. */
	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.rb_reversed = 0;
	tmp.has_component_alpha = 0;
	tmp.need_magic_ca_pass = false;

	gen3_init_solid(&tmp.src, pixel);
	tmp.mask.bo = NULL;
	tmp.mask.u.gen3.type = SHADER_NONE;
	tmp.u.gen3.num_constants = 0;

	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
			return false;
	}

	gen3_align_vertex(sna, &tmp);
	gen3_emit_composite_state(sna, &tmp);

	do {
		int n_this_time;

		n_this_time = gen3_get_rectangles(sna, &tmp, n);
		n -= n_this_time;

		do {
			DBG(("	(%d, %d), (%d, %d): %x\n",
			     box->x1, box->y1, box->x2, box->y2, pixel));
			OUT_VERTEX(box->x2);
			OUT_VERTEX(box->y2);
			OUT_VERTEX(box->x1);
			OUT_VERTEX(box->y2);
			OUT_VERTEX(box->x1);
			OUT_VERTEX(box->y1);
			box++;
		} while (--n_this_time);
	} while (n);

	gen3_vertex_flush(sna);
	return true;
}
6018
6019static void
6020gen3_render_fill_op_blt(struct sna *sna,
6021			const struct sna_fill_op *op,
6022			int16_t x, int16_t y, int16_t w, int16_t h)
6023{
6024	gen3_get_rectangles(sna, &op->base, 1);
6025
6026	OUT_VERTEX(x+w);
6027	OUT_VERTEX(y+h);
6028	OUT_VERTEX(x);
6029	OUT_VERTEX(y+h);
6030	OUT_VERTEX(x);
6031	OUT_VERTEX(y);
6032}
6033
6034fastcall static void
6035gen3_render_fill_op_box(struct sna *sna,
6036			const struct sna_fill_op *op,
6037			const BoxRec *box)
6038{
6039	gen3_get_rectangles(sna, &op->base, 1);
6040
6041	OUT_VERTEX(box->x2);
6042	OUT_VERTEX(box->y2);
6043	OUT_VERTEX(box->x1);
6044	OUT_VERTEX(box->y2);
6045	OUT_VERTEX(box->x1);
6046	OUT_VERTEX(box->y1);
6047}
6048
6049fastcall static void
6050gen3_render_fill_op_boxes(struct sna *sna,
6051			  const struct sna_fill_op *op,
6052			  const BoxRec *box,
6053			  int nbox)
6054{
6055	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
6056	     box->x1, box->y1, box->x2, box->y2, nbox));
6057
6058	do {
6059		int nbox_this_time;
6060
6061		nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
6062		nbox -= nbox_this_time;
6063
6064		do {
6065			OUT_VERTEX(box->x2);
6066			OUT_VERTEX(box->y2);
6067			OUT_VERTEX(box->x1);
6068			OUT_VERTEX(box->y2);
6069			OUT_VERTEX(box->x1);
6070			OUT_VERTEX(box->y1);
6071			box++;
6072		} while (--nbox_this_time);
6073	} while (nbox);
6074}
6075
6076static void
6077gen3_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
6078{
6079	if (sna->render.vertex_offset)
6080		gen3_vertex_flush(sna);
6081}
6082
/*
 * Prepare a reusable solid-fill operation, populating tmp with
 * blt/box/boxes/done callbacks.
 *
 * The BLT engine is preferred; the 3D pipeline is used only for
 * GXcopy/GXclear on destinations within its size and pitch limits.
 */
static bool
gen3_render_fill(struct sna *sna, uint8_t alu,
		 PixmapPtr dst, struct kgem_bo *dst_bo,
		 uint32_t color, unsigned flags,
		 struct sna_fill_op *tmp)
{
#if NO_FILL
	return sna_blt_fill(sna, alu,
			    dst_bo, dst->drawable.bitsPerPixel,
			    color,
			    tmp);
#endif

	/* Prefer to use the BLT if already engaged */
	if (prefer_fill_blt(sna) &&
	    sna_blt_fill(sna, alu,
			 dst_bo, dst->drawable.bitsPerPixel,
			 color,
			 tmp))
		return true;

	/* Must use the BLT if we can't RENDER... */
	if (!(alu == GXcopy || alu == GXclear) ||
	    too_large(dst->drawable.width, dst->drawable.height) ||
	    dst_bo->pitch > MAX_3D_PITCH)
		return sna_blt_fill(sna, alu,
				    dst_bo, dst->drawable.bitsPerPixel,
				    color,
				    tmp);

	if (alu == GXclear)
		color = 0;

	/* Describe the fill as a solid-shader composite. */
	tmp->base.op = color == 0 ? PictOpClear : PictOpSrc;
	tmp->base.dst.pixmap = dst;
	tmp->base.dst.width = dst->drawable.width;
	tmp->base.dst.height = dst->drawable.height;
	tmp->base.dst.format = sna_format_for_depth(dst->drawable.depth);
	tmp->base.dst.bo = dst_bo;
	/* Position-only vertices: 2 floats each, 3 per rectangle. */
	tmp->base.floats_per_vertex = 2;
	tmp->base.floats_per_rect = 6;
	tmp->base.need_magic_ca_pass = 0;
	tmp->base.has_component_alpha = 0;
	tmp->base.rb_reversed = 0;

	gen3_init_solid(&tmp->base.src,
			sna_rgba_for_color(color, dst->drawable.depth));
	tmp->base.mask.bo = NULL;
	tmp->base.mask.u.gen3.type = SHADER_NONE;
	tmp->base.u.gen3.num_constants = 0;

	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
			return false;
	}

	tmp->blt   = gen3_render_fill_op_blt;
	tmp->box   = gen3_render_fill_op_box;
	tmp->boxes = gen3_render_fill_op_boxes;
	tmp->points = NULL;
	tmp->done  = gen3_render_fill_op_done;

	gen3_align_vertex(sna, &tmp->base);
	gen3_emit_composite_state(sna, &tmp->base);
	return true;
}
6150
6151static bool
6152gen3_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
6153			     uint32_t color,
6154			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
6155			     uint8_t alu)
6156{
6157	BoxRec box;
6158
6159	box.x1 = x1;
6160	box.y1 = y1;
6161	box.x2 = x2;
6162	box.y2 = y2;
6163
6164	return sna_blt_fill_boxes(sna, alu,
6165				  bo, dst->drawable.bitsPerPixel,
6166				  color, &box, 1);
6167}
6168
/* Fill a single rectangle (x1, y1)-(x2, y2) on dst with the given solid
 * colour.  Prefers the BLT engine when it is already active or when the
 * target is outside the gen3 3D pipeline limits; otherwise emits a
 * one-rectangle RENDER operation directly.  Returns false only when
 * neither engine can service the request.
 */
static bool
gen3_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
		     uint32_t color,
		     int16_t x1, int16_t y1,
		     int16_t x2, int16_t y2,
		     uint8_t alu)
{
	struct sna_composite_op tmp;

#if NO_FILL_ONE
	return gen3_render_fill_one_try_blt(sna, dst, bo, color,
					    x1, y1, x2, y2, alu);
#endif

	/* Prefer to use the BLT if already engaged */
	if (prefer_fill_blt(sna) &&
	    gen3_render_fill_one_try_blt(sna, dst, bo, color,
					 x1, y1, x2, y2, alu))
		return true;

	/* Must use the BLT if we can't RENDER... */
	if (!(alu == GXcopy || alu == GXclear) ||
	    too_large(dst->drawable.width, dst->drawable.height) ||
	    bo->pitch > MAX_3D_PITCH)
		return gen3_render_fill_one_try_blt(sna, dst, bo, color,
						    x1, y1, x2, y2, alu);

	if (alu == GXclear)
		color = 0;

	/* Zero colour maps to PictOpClear; otherwise a plain source fill. */
	tmp.op = color == 0 ? PictOpClear : PictOpSrc;
	tmp.dst.pixmap = dst;
	tmp.dst.width = dst->drawable.width;
	tmp.dst.height = dst->drawable.height;
	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
	tmp.dst.bo = bo;
	/* Solid fill: (x, y) only, 3 vertices (6 floats) per rectangle. */
	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.need_magic_ca_pass = 0;
	tmp.has_component_alpha = 0;
	tmp.rb_reversed = 0;

	gen3_init_solid(&tmp.src,
			sna_rgba_for_color(color, dst->drawable.depth));
	tmp.mask.bo = NULL;
	tmp.mask.u.gen3.type = SHADER_NONE;
	tmp.u.gen3.num_constants = 0;

	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
		kgem_submit(&sna->kgem);

		/* With the batch flushed, retry the cheap BLT path before
		 * re-testing whether the target now fits for RENDER.
		 */
		if (gen3_render_fill_one_try_blt(sna, dst, bo, color,
						 x1, y1, x2, y2, alu))
			return true;

		if (!kgem_check_bo(&sna->kgem, bo, NULL))
			return false;
	}

	/* Emit state, reserve one rectangle, then its three vertices:
	 * bottom-right, bottom-left, top-left (the fourth corner is
	 * implied by the rectangle primitive).
	 */
	gen3_align_vertex(sna, &tmp);
	gen3_emit_composite_state(sna, &tmp);
	gen3_get_rectangles(sna, &tmp, 1);
	DBG(("	(%d, %d), (%d, %d): %x\n", x1, y1, x2, y2, color));
	OUT_VERTEX(x2);
	OUT_VERTEX(y2);
	OUT_VERTEX(x1);
	OUT_VERTEX(y2);
	OUT_VERTEX(x1);
	OUT_VERTEX(y1);
	gen3_vertex_flush(sna);

	return true;
}
6242
/* Flush the render pipeline at batch-submission time: close the vertex
 * buffer and verify that no vertex relocation or open vertex run remains.
 */
static void gen3_render_flush(struct sna *sna)
{
	gen3_vertex_close(sna);

	assert(sna->render.vertex_reloc[0] == 0);
	assert(sna->render.vertex_offset == 0);
}
6250
/* Tear down the gen3 render backend.  Nothing to release: gen3 setup
 * allocates no per-backend state beyond what sna itself owns.
 */
static void
gen3_render_fini(struct sna *sna)
{
}
6255
6256const char *gen3_render_init(struct sna *sna, const char *backend)
6257{
6258	struct sna_render *render = &sna->render;
6259
6260#if !NO_COMPOSITE
6261	render->composite = gen3_render_composite;
6262	render->prefer_gpu |= PREFER_GPU_RENDER;
6263#endif
6264#if !NO_COMPOSITE_SPANS
6265	render->check_composite_spans = gen3_check_composite_spans;
6266	render->composite_spans = gen3_render_composite_spans;
6267	render->prefer_gpu |= PREFER_GPU_SPANS;
6268#endif
6269
6270	render->video = gen3_render_video;
6271
6272	render->copy_boxes = gen3_render_copy_boxes;
6273	render->copy = gen3_render_copy;
6274
6275	render->fill_boxes = gen3_render_fill_boxes;
6276	render->fill = gen3_render_fill;
6277	render->fill_one = gen3_render_fill_one;
6278
6279	render->reset = gen3_render_reset;
6280	render->flush = gen3_render_flush;
6281	render->fini = gen3_render_fini;
6282
6283	render->max_3d_size = MAX_3D_SIZE;
6284	render->max_3d_pitch = MAX_3D_PITCH;
6285
6286	sna->kgem.retire = gen3_render_retire;
6287	sna->kgem.expire = gen3_render_expire;
6288	return "Alviso (gen3)";
6289}
6290