1/*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Chris Wilson <chris@chris-wilson.co.uk>
25 *
26 */
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "sna.h"
33#include "sna_render.h"
34#include "sna_render_inline.h"
35#include "gen4_vertex.h"
36
37#ifndef sse2
38#define sse2
39#endif
40
/* Round vertex_used up to a whole-vertex boundary for op's vertex
 * layout (vertex_used is counted in floats).  If fewer than two
 * rectangles' worth of floats would remain in the buffer, replace the
 * vbo via gen4_vertex_finish(), submitting the batch if even that
 * cannot make room.
 */
void gen4_vertex_align(struct sna *sna, const struct sna_composite_op *op)
{
	int vertex_index;

	assert(op->floats_per_vertex);
	assert(op->floats_per_rect == 3*op->floats_per_vertex);
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* First vertex index at or after the current write position. */
	vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
	if ((int)sna->render.vertex_size - vertex_index * op->floats_per_vertex < 2*op->floats_per_rect) {
		DBG(("%s: flushing vertex buffer: new index=%d, max=%d\n",
		     __FUNCTION__, vertex_index, sna->render.vertex_size / op->floats_per_vertex));
		/* Swap in a fresh vbo; if the replacement is still too
		 * small, flush the whole batch and re-enter render mode. */
		if (gen4_vertex_finish(sna) < 2*op->floats_per_rect) {
			kgem_submit(&sna->kgem);
			_kgem_set_mode(&sna->kgem, KGEM_RENDER);
		}
		assert(sna->render.vertex_used < sna->render.vertex_size);

		/* Recompute against the (possibly reset) write position. */
		vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
		assert(vertex_index * op->floats_per_vertex <= sna->render.vertex_size);
	}

	sna->render.vertex_index = vertex_index;
	sna->render.vertex_used = vertex_index * op->floats_per_vertex;
}
66
/* Patch the batch dword reserved at vertex_offset with the number of
 * vertices emitted since vertex_start, then clear the pending offset.
 * Must only be called while an emission is pending (vertex_offset != 0).
 */
void gen4_vertex_flush(struct sna *sna)
{
	DBG(("%s[%x] = %d\n", __FUNCTION__,
	     4*sna->render.vertex_offset,
	     sna->render.vertex_index - sna->render.vertex_start));

	assert(sna->render.vertex_offset);
	assert(sna->render.vertex_offset <= sna->kgem.nbatch);
	assert(sna->render.vertex_index > sna->render.vertex_start);
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* Back-fill the vertex count into the reserved batch slot. */
	sna->kgem.batch[sna->render.vertex_offset] =
		sna->render.vertex_index - sna->render.vertex_start;
	sna->render.vertex_offset = 0;
}
82
/* Detach the current vbo (fixing up every batch relocation that
 * references it) and allocate a replacement, copying across any floats
 * already staged in the static vertex_data array.
 *
 * Returns the number of floats available for writing in the new
 * buffer, or 0 when no bo could be mapped and we fell back to the
 * static staging array (or when idle with no vbo outstanding).
 */
int gen4_vertex_finish(struct sna *sna)
{
	struct kgem_bo *bo;
	unsigned int i;
	unsigned hint, size;

	DBG(("%s: used=%d / %d\n", __FUNCTION__,
	     sna->render.vertex_used, sna->render.vertex_size));
	assert(sna->render.vertex_offset == 0);
	assert(sna->render.vertex_used);
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	sna_vertex_wait__locked(&sna->render);

	/* Note: we only need dword alignment (currently) */

	hint = CREATE_GTT_MAP;

	bo = sna->render.vbo;
	if (bo) {
		/* Bind the outgoing vbo to each recorded relocation slot. */
		for (i = 0; i < sna->render.nvertex_reloc; i++) {
			DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
			     i, sna->render.vertex_reloc[i]));

			sna->kgem.batch[sna->render.vertex_reloc[i]] =
				kgem_add_reloc(&sna->kgem,
					       sna->render.vertex_reloc[i], bo,
					       I915_GEM_DOMAIN_VERTEX << 16,
					       0);
		}

		/* Reset all vertex bookkeeping before dropping our reference. */
		assert(!sna->render.active);
		sna->render.nvertex_reloc = 0;
		sna->render.vertex_used = 0;
		sna->render.vertex_index = 0;
		sna->render.vbo = NULL;
		sna->render.vb_id = 0;

		kgem_bo_destroy(&sna->kgem, bo);
		hint |= CREATE_CACHED | CREATE_NO_THROTTLE;
	} else {
		/* No vbo yet: data lives in the static staging array. */
		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
		assert(sna->render.vertices == sna->render.vertex_data);
		if (kgem_is_idle(&sna->kgem))
			return 0;
	}

	/* Allocate the replacement, halving the request on failure until
	 * it would no longer fit the staged data. */
	size = 256*1024;
	assert(!sna->render.active);
	sna->render.vertices = NULL;
	sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
	while (sna->render.vbo == NULL && size > sizeof(sna->render.vertex_data)) {
		size /= 2;
		sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
	}
	if (sna->render.vbo == NULL)
		sna->render.vbo = kgem_create_linear(&sna->kgem,
						     256*1024, CREATE_GTT_MAP);
	if (sna->render.vbo &&
	    kgem_check_bo(&sna->kgem, sna->render.vbo, NULL))
		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
	if (sna->render.vertices == NULL) {
		/* Could not map a bo: fall back to the static array. */
		if (sna->render.vbo) {
			kgem_bo_destroy(&sna->kgem, sna->render.vbo);
			sna->render.vbo = NULL;
		}
		sna->render.vertices = sna->render.vertex_data;
		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
		return 0;
	}

	/* Carry over any vertices staged before the switch. */
	if (sna->render.vertex_used) {
		DBG(("%s: copying initial buffer x %d to handle=%d\n",
		     __FUNCTION__,
		     sna->render.vertex_used,
		     sna->render.vbo->handle));
		assert(sizeof(float)*sna->render.vertex_used <=
		       __kgem_bo_size(sna->render.vbo));
		memcpy(sna->render.vertices,
		       sna->render.vertex_data,
		       sizeof(float)*sna->render.vertex_used);
	}

	/* Clamp the float capacity below UINT16_MAX. */
	size = __kgem_bo_size(sna->render.vbo)/4;
	if (size >= UINT16_MAX)
		size = UINT16_MAX - 1;

	DBG(("%s: create vbo handle=%d, size=%d floats [%d bytes]\n",
	     __FUNCTION__, sna->render.vbo->handle, size, __kgem_bo_size(sna->render.vbo)));
	assert(size > sna->render.vertex_used);

	sna->render.vertex_size = size;
	return size - sna->render.vertex_used;
}
177
/* Finalize vertex data at the end of a batch: bind the vertex storage
 * (the active vbo, a copy appended to the batch itself, or a freshly
 * uploaded bo) to every recorded relocation, then reset the
 * vertex-buffer bookkeeping for the next batch.
 */
void gen4_vertex_close(struct sna *sna)
{
	struct kgem_bo *bo, *free_bo = NULL;
	unsigned int i, delta = 0;

	assert(sna->render.vertex_offset == 0);
	/* No vertex buffer bound in this batch: nothing to fix up. */
	if (!sna->render.vb_id)
		return;

	DBG(("%s: used=%d, vbo active? %d, vb=%x, nreloc=%d\n",
	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo ? sna->render.vbo->handle : 0,
	     sna->render.vb_id, sna->render.nvertex_reloc));

	assert(!sna->render.active);

	bo = sna->render.vbo;
	if (bo) {
		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
			/* Nearly full: retire the vbo and revert to the
			 * static staging array for the next batch. */
			DBG(("%s: discarding vbo (full), handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
			sna->render.vbo = NULL;
			sna->render.vertices = sna->render.vertex_data;
			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
			free_bo = bo;
		} else if (!sna->kgem.has_llc && sna->render.vertices == MAP(bo->map__cpu)) {
			/* No LLC: switch from the CPU map to a GTT map for
			 * subsequent writes; drop the vbo if that fails. */
			DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
			sna->render.vertices =
				kgem_bo_map__gtt(&sna->kgem, sna->render.vbo);
			if (sna->render.vertices == NULL) {
				sna->render.vbo = NULL;
				sna->render.vertices = sna->render.vertex_data;
				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
				free_bo = bo;
			}

		}
	} else {
		/* Vertices still live in the static staging array. */
		int size;

		size  = sna->kgem.nbatch;
		size += sna->kgem.batch_size - sna->kgem.surface;
		size += sna->render.vertex_used;

		if (size <= 1024) {
			/* Small enough: append the vertices directly into
			 * the batch buffer and reference them by delta. */
			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
			     sna->render.vertex_used, sna->kgem.nbatch));
			assert(sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface);
			memcpy(sna->kgem.batch + sna->kgem.nbatch,
			       sna->render.vertex_data,
			       sna->render.vertex_used * 4);
			delta = sna->kgem.nbatch * 4;
			bo = NULL;
			sna->kgem.nbatch += sna->render.vertex_used;
		} else {
			/* Allocate a mappable vbo, halving on failure while
			 * it still fits the staged vertices. */
			size = 256 * 1024;
			do {
				bo = kgem_create_linear(&sna->kgem, size,
							CREATE_GTT_MAP | CREATE_NO_RETIRE | CREATE_NO_THROTTLE | CREATE_CACHED);
			} while (bo == NULL && (size>>=1) > sizeof(float)*sna->render.vertex_used);

			sna->render.vertices = NULL;
			if (bo)
				sna->render.vertices = kgem_bo_map(&sna->kgem, bo);
			if (sna->render.vertices != NULL) {
				/* Keep this bo as the active vbo for the
				 * next batch as well. */
				DBG(("%s: new vbo: %d / %d\n", __FUNCTION__,
				     sna->render.vertex_used, __kgem_bo_size(bo)/4));

				assert(sizeof(float)*sna->render.vertex_used <= __kgem_bo_size(bo));
				memcpy(sna->render.vertices,
				       sna->render.vertex_data,
				       sizeof(float)*sna->render.vertex_used);

				size = __kgem_bo_size(bo)/4;
				if (size >= UINT16_MAX)
					size = UINT16_MAX - 1;

				sna->render.vbo = bo;
				sna->render.vertex_size = size;
			} else {
				/* Mapping failed: upload this batch's
				 * vertices into a throwaway bo. */
				DBG(("%s: tmp vbo: %d\n", __FUNCTION__,
				     sna->render.vertex_used));

				if (bo)
					kgem_bo_destroy(&sna->kgem, bo);

				bo = kgem_create_linear(&sna->kgem,
							4*sna->render.vertex_used,
							CREATE_NO_THROTTLE);
				if (bo && !kgem_bo_write(&sna->kgem, bo,
							 sna->render.vertex_data,
							 4*sna->render.vertex_used)) {
					kgem_bo_destroy(&sna->kgem, bo);
					bo = NULL;
				}

				assert(sna->render.vbo == NULL);
				sna->render.vertices = sna->render.vertex_data;
				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
				free_bo = bo;
			}
		}
	}

	/* Bind the chosen storage (bo, or batch itself when bo == NULL)
	 * to every recorded relocation. */
	assert(sna->render.nvertex_reloc);
	for (i = 0; i < sna->render.nvertex_reloc; i++) {
		DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
		     i, sna->render.vertex_reloc[i]));

		sna->kgem.batch[sna->render.vertex_reloc[i]] =
			kgem_add_reloc(&sna->kgem,
				       sna->render.vertex_reloc[i], bo,
				       I915_GEM_DOMAIN_VERTEX << 16,
				       delta);
	}
	sna->render.nvertex_reloc = 0;
	sna->render.vb_id = 0;

	if (sna->render.vbo == NULL) {
		/* Back on the static array: restart from the beginning. */
		assert(!sna->render.active);
		sna->render.vertex_used = 0;
		sna->render.vertex_index = 0;
		assert(sna->render.vertices == sna->render.vertex_data);
		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
	}

	if (free_bo)
		kgem_bo_destroy(&sna->kgem, free_bo);
}
305
306/* specialised vertex emission routines */
307
308#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y) /* XXX assert(!too_large(x, y)); */
309#define OUT_VERTEX_F(v) vertex_emit(sna, v)
310
311force_inline static float
312compute_linear(const struct sna_composite_channel *channel,
313	       int16_t x, int16_t y)
314{
315	return ((x+channel->offset[0]) * channel->u.linear.dx +
316		(y+channel->offset[1]) * channel->u.linear.dy +
317		channel->u.linear.offset);
318}
319
320sse2 inline static void
321emit_texcoord(struct sna *sna,
322	      const struct sna_composite_channel *channel,
323	      int16_t x, int16_t y)
324{
325	if (channel->is_solid) {
326		OUT_VERTEX_F(0.5);
327		return;
328	}
329
330	x += channel->offset[0];
331	y += channel->offset[1];
332
333	if (channel->is_affine) {
334		float s, t;
335
336		sna_get_transformed_coordinates(x, y,
337						channel->transform,
338						&s, &t);
339		OUT_VERTEX_F(s * channel->scale[0]);
340		OUT_VERTEX_F(t * channel->scale[1]);
341	} else {
342		float s, t, w;
343
344		sna_get_transformed_coordinates_3d(x, y,
345						   channel->transform,
346						   &s, &t, &w);
347		OUT_VERTEX_F(s * channel->scale[0]);
348		OUT_VERTEX_F(t * channel->scale[1]);
349		OUT_VERTEX_F(w);
350	}
351}
352
/* Emit one no-mask vertex: the packed dst coordinate followed by the
 * source texcoord.  The mask coordinates are accepted only so the
 * signature mirrors emit_vertex_mask; they are intentionally unused.
 */
sse2 force_inline static void
emit_vertex(struct sna *sna,
	    const struct sna_composite_op *op,
	    int16_t srcX, int16_t srcY,
	    int16_t mskX, int16_t mskY,
	    int16_t dstX, int16_t dstY)
{
	OUT_VERTEX(dstX, dstY);
	emit_texcoord(sna, &op->src, srcX, srcY);
}
363
/* Emit one rectangle (no mask) as three vertices in the fixed order:
 * bottom-right, bottom-left, top-left.
 */
sse2 fastcall static void
emit_primitive(struct sna *sna,
	       const struct sna_composite_op *op,
	       const struct sna_composite_rectangles *r)
{
	emit_vertex(sna, op,
		    r->src.x + r->width,  r->src.y + r->height,
		    r->mask.x + r->width, r->mask.y + r->height,
		    r->dst.x + r->width, r->dst.y + r->height);
	emit_vertex(sna, op,
		    r->src.x,  r->src.y + r->height,
		    r->mask.x, r->mask.y + r->height,
		    r->dst.x,  r->dst.y + r->height);
	emit_vertex(sna, op,
		    r->src.x,  r->src.y,
		    r->mask.x, r->mask.y,
		    r->dst.x,  r->dst.y);
}
382
383sse2 inline static float *
384vemit_texcoord(float *v,
385	      const struct sna_composite_channel *channel,
386	      int16_t x, int16_t y)
387{
388	if (channel->is_solid) {
389		*v++ = 0.5;
390	} else {
391		x += channel->offset[0];
392		y += channel->offset[1];
393
394		if (channel->is_affine) {
395			float s, t;
396
397			sna_get_transformed_coordinates(x, y,
398							channel->transform,
399							&s, &t);
400			*v++ = s * channel->scale[0];
401			*v++ = t * channel->scale[1];
402		} else {
403			float s, t, w;
404
405			sna_get_transformed_coordinates_3d(x, y,
406							   channel->transform,
407							   &s, &t, &w);
408			*v++ = s * channel->scale[0];
409			*v++ = t * channel->scale[1];
410			*v++ = w;
411		}
412	}
413
414	return v;
415}
416
417sse2 force_inline static float *
418vemit_vertex(float *v,
419	     const struct sna_composite_op *op,
420	     int16_t x, int16_t y)
421{
422	*v++ = pack_2s(x, y);
423	return vemit_texcoord(v, &op->src, x, y);
424}
425
426sse2 fastcall static void
427emit_boxes(const struct sna_composite_op *op,
428	   const BoxRec *box, int nbox,
429	   float *v)
430{
431	do {
432		v = vemit_vertex(v, op, box->x2, box->y2);
433		v = vemit_vertex(v, op, box->x1, box->y2);
434		v = vemit_vertex(v, op, box->x1, box->y1);
435
436		box++;
437	} while (--nbox);
438}
439
/* Emit one masked vertex: packed dst coordinate, source texcoord, then
 * mask texcoord.
 */
sse2 force_inline static void
emit_vertex_mask(struct sna *sna,
		 const struct sna_composite_op *op,
		 int16_t srcX, int16_t srcY,
		 int16_t mskX, int16_t mskY,
		 int16_t dstX, int16_t dstY)
{
	OUT_VERTEX(dstX, dstY);
	emit_texcoord(sna, &op->src, srcX, srcY);
	emit_texcoord(sna, &op->mask, mskX, mskY);
}
451
/* Emit one masked rectangle as three vertices in the fixed order:
 * bottom-right, bottom-left, top-left.
 */
sse2 fastcall static void
emit_primitive_mask(struct sna *sna,
		    const struct sna_composite_op *op,
		    const struct sna_composite_rectangles *r)
{
	emit_vertex_mask(sna, op,
			 r->src.x + r->width,  r->src.y + r->height,
			 r->mask.x + r->width, r->mask.y + r->height,
			 r->dst.x + r->width, r->dst.y + r->height);
	emit_vertex_mask(sna, op,
			 r->src.x,  r->src.y + r->height,
			 r->mask.x, r->mask.y + r->height,
			 r->dst.x,  r->dst.y + r->height);
	emit_vertex_mask(sna, op,
			 r->src.x,  r->src.y,
			 r->mask.x, r->mask.y,
			 r->dst.x,  r->dst.y);
}
470
471sse2 force_inline static float *
472vemit_vertex_mask(float *v,
473		  const struct sna_composite_op *op,
474		  int16_t x, int16_t y)
475{
476	*v++ = pack_2s(x, y);
477	v = vemit_texcoord(v, &op->src, x, y);
478	v = vemit_texcoord(v, &op->mask, x, y);
479	return v;
480}
481
482sse2 fastcall static void
483emit_boxes_mask(const struct sna_composite_op *op,
484		const BoxRec *box, int nbox,
485		float *v)
486{
487	do {
488		v = vemit_vertex_mask(v, op, box->x2, box->y2);
489		v = vemit_vertex_mask(v, op, box->x1, box->y2);
490		v = vemit_vertex_mask(v, op, box->x1, box->y1);
491
492		box++;
493	} while (--nbox);
494}
495
496
497sse2 fastcall static void
498emit_primitive_solid(struct sna *sna,
499		     const struct sna_composite_op *op,
500		     const struct sna_composite_rectangles *r)
501{
502	float *v;
503	union {
504		struct sna_coordinate p;
505		float f;
506	} dst;
507
508	assert(op->floats_per_rect == 6);
509	assert((sna->render.vertex_used % 2) == 0);
510	v = sna->render.vertices + sna->render.vertex_used;
511	sna->render.vertex_used += 6;
512	assert(sna->render.vertex_used <= sna->render.vertex_size);
513
514	dst.p.x = r->dst.x + r->width;
515	dst.p.y = r->dst.y + r->height;
516	v[0] = dst.f;
517	dst.p.x = r->dst.x;
518	v[2] = dst.f;
519	dst.p.y = r->dst.y;
520	v[4] = dst.f;
521
522	v[5] = v[3] = v[1] = .5;
523}
524
525sse2 fastcall static void
526emit_boxes_solid(const struct sna_composite_op *op,
527		 const BoxRec *box, int nbox,
528		 float *v)
529{
530	do {
531		union {
532			struct sna_coordinate p;
533			float f;
534		} dst;
535
536		dst.p.x = box->x2;
537		dst.p.y = box->y2;
538		v[0] = dst.f;
539		dst.p.x = box->x1;
540		v[2] = dst.f;
541		dst.p.y = box->y1;
542		v[4] = dst.f;
543
544		v[5] = v[3] = v[1] = .5;
545		box++;
546		v += 6;
547	} while (--nbox);
548}
549
550sse2 fastcall static void
551emit_primitive_linear(struct sna *sna,
552		      const struct sna_composite_op *op,
553		      const struct sna_composite_rectangles *r)
554{
555	float *v;
556	union {
557		struct sna_coordinate p;
558		float f;
559	} dst;
560
561	assert(op->floats_per_rect == 6);
562	assert((sna->render.vertex_used % 2) == 0);
563	v = sna->render.vertices + sna->render.vertex_used;
564	sna->render.vertex_used += 6;
565	assert(sna->render.vertex_used <= sna->render.vertex_size);
566
567	dst.p.x = r->dst.x + r->width;
568	dst.p.y = r->dst.y + r->height;
569	v[0] = dst.f;
570	dst.p.x = r->dst.x;
571	v[2] = dst.f;
572	dst.p.y = r->dst.y;
573	v[4] = dst.f;
574
575	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
576	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
577	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
578}
579
580sse2 fastcall static void
581emit_boxes_linear(const struct sna_composite_op *op,
582		  const BoxRec *box, int nbox,
583		  float *v)
584{
585	union {
586		struct sna_coordinate p;
587		float f;
588	} dst;
589
590	do {
591		dst.p.x = box->x2;
592		dst.p.y = box->y2;
593		v[0] = dst.f;
594		dst.p.x = box->x1;
595		v[2] = dst.f;
596		dst.p.y = box->y1;
597		v[4] = dst.f;
598
599		v[1] = compute_linear(&op->src, box->x2, box->y2);
600		v[3] = compute_linear(&op->src, box->x1, box->y2);
601		v[5] = compute_linear(&op->src, box->x1, box->y1);
602
603		v += 6;
604		box++;
605	} while (--nbox);
606}
607
608sse2 fastcall static void
609emit_primitive_identity_source(struct sna *sna,
610			       const struct sna_composite_op *op,
611			       const struct sna_composite_rectangles *r)
612{
613	union {
614		struct sna_coordinate p;
615		float f;
616	} dst;
617	float *v;
618
619	assert(op->floats_per_rect == 9);
620	assert((sna->render.vertex_used % 3) == 0);
621	v = sna->render.vertices + sna->render.vertex_used;
622	sna->render.vertex_used += 9;
623
624	dst.p.x = r->dst.x + r->width;
625	dst.p.y = r->dst.y + r->height;
626	v[0] = dst.f;
627	dst.p.x = r->dst.x;
628	v[3] = dst.f;
629	dst.p.y = r->dst.y;
630	v[6] = dst.f;
631
632	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
633	v[1] = v[4] + r->width * op->src.scale[0];
634
635	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
636	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
637}
638
639sse2 fastcall static void
640emit_boxes_identity_source(const struct sna_composite_op *op,
641			   const BoxRec *box, int nbox,
642			   float *v)
643{
644	do {
645		union {
646			struct sna_coordinate p;
647			float f;
648		} dst;
649
650		dst.p.x = box->x2;
651		dst.p.y = box->y2;
652		v[0] = dst.f;
653		dst.p.x = box->x1;
654		v[3] = dst.f;
655		dst.p.y = box->y1;
656		v[6] = dst.f;
657
658		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
659		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
660
661		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
662		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
663
664		v += 9;
665		box++;
666	} while (--nbox);
667}
668
669sse2 fastcall static void
670emit_primitive_simple_source(struct sna *sna,
671			     const struct sna_composite_op *op,
672			     const struct sna_composite_rectangles *r)
673{
674	float *v;
675	union {
676		struct sna_coordinate p;
677		float f;
678	} dst;
679
680	float xx = op->src.transform->matrix[0][0];
681	float x0 = op->src.transform->matrix[0][2];
682	float yy = op->src.transform->matrix[1][1];
683	float y0 = op->src.transform->matrix[1][2];
684	float sx = op->src.scale[0];
685	float sy = op->src.scale[1];
686	int16_t tx = op->src.offset[0];
687	int16_t ty = op->src.offset[1];
688
689	assert(op->floats_per_rect == 9);
690	assert((sna->render.vertex_used % 3) == 0);
691	v = sna->render.vertices + sna->render.vertex_used;
692	sna->render.vertex_used += 3*3;
693
694	dst.p.x = r->dst.x + r->width;
695	dst.p.y = r->dst.y + r->height;
696	v[0] = dst.f;
697	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
698	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
699
700	dst.p.x = r->dst.x;
701	v[3] = dst.f;
702	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
703
704	dst.p.y = r->dst.y;
705	v[6] = dst.f;
706	v[8] = ((r->src.y + ty) * yy + y0) * sy;
707}
708
709sse2 fastcall static void
710emit_boxes_simple_source(const struct sna_composite_op *op,
711			 const BoxRec *box, int nbox,
712			 float *v)
713{
714	float xx = op->src.transform->matrix[0][0];
715	float x0 = op->src.transform->matrix[0][2];
716	float yy = op->src.transform->matrix[1][1];
717	float y0 = op->src.transform->matrix[1][2];
718	float sx = op->src.scale[0];
719	float sy = op->src.scale[1];
720	int16_t tx = op->src.offset[0];
721	int16_t ty = op->src.offset[1];
722
723	do {
724		union {
725			struct sna_coordinate p;
726			float f;
727		} dst;
728
729		dst.p.x = box->x2;
730		dst.p.y = box->y2;
731		v[0] = dst.f;
732		v[1] = ((box->x2 + tx) * xx + x0) * sx;
733		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
734
735		dst.p.x = box->x1;
736		v[3] = dst.f;
737		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
738
739		dst.p.y = box->y1;
740		v[6] = dst.f;
741		v[8] = ((box->y1 + ty) * yy + y0) * sy;
742
743		v += 9;
744		box++;
745	} while (--nbox);
746}
747
748sse2 fastcall static void
749emit_primitive_affine_source(struct sna *sna,
750			     const struct sna_composite_op *op,
751			     const struct sna_composite_rectangles *r)
752{
753	union {
754		struct sna_coordinate p;
755		float f;
756	} dst;
757	float *v;
758
759	assert(op->floats_per_rect == 9);
760	assert((sna->render.vertex_used % 3) == 0);
761	v = sna->render.vertices + sna->render.vertex_used;
762	sna->render.vertex_used += 9;
763
764	dst.p.x = r->dst.x + r->width;
765	dst.p.y = r->dst.y + r->height;
766	v[0] = dst.f;
767	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
768				    op->src.offset[1] + r->src.y + r->height,
769				    op->src.transform, op->src.scale,
770				    &v[1], &v[2]);
771
772	dst.p.x = r->dst.x;
773	v[3] = dst.f;
774	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
775				    op->src.offset[1] + r->src.y + r->height,
776				    op->src.transform, op->src.scale,
777				    &v[4], &v[5]);
778
779	dst.p.y = r->dst.y;
780	v[6] = dst.f;
781	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
782				    op->src.offset[1] + r->src.y,
783				    op->src.transform, op->src.scale,
784				    &v[7], &v[8]);
785}
786
787sse2 fastcall static void
788emit_boxes_affine_source(const struct sna_composite_op *op,
789			 const BoxRec *box, int nbox,
790			 float *v)
791{
792	do {
793		union {
794			struct sna_coordinate p;
795			float f;
796		} dst;
797
798		dst.p.x = box->x2;
799		dst.p.y = box->y2;
800		v[0] = dst.f;
801		_sna_get_transformed_scaled(op->src.offset[0] + box->x2,
802					    op->src.offset[1] + box->y2,
803					    op->src.transform, op->src.scale,
804					    &v[1], &v[2]);
805
806		dst.p.x = box->x1;
807		v[3] = dst.f;
808		_sna_get_transformed_scaled(op->src.offset[0] + box->x1,
809					    op->src.offset[1] + box->y2,
810					    op->src.transform, op->src.scale,
811					    &v[4], &v[5]);
812
813		dst.p.y = box->y1;
814		v[6] = dst.f;
815		_sna_get_transformed_scaled(op->src.offset[0] + box->x1,
816					    op->src.offset[1] + box->y1,
817					    op->src.transform, op->src.scale,
818					    &v[7], &v[8]);
819		box++;
820		v += 9;
821	} while (--nbox);
822}
823
824sse2 fastcall static void
825emit_primitive_identity_mask(struct sna *sna,
826			     const struct sna_composite_op *op,
827			     const struct sna_composite_rectangles *r)
828{
829	union {
830		struct sna_coordinate p;
831		float f;
832	} dst;
833	float msk_x, msk_y;
834	float w, h;
835	float *v;
836
837	msk_x = r->mask.x + op->mask.offset[0];
838	msk_y = r->mask.y + op->mask.offset[1];
839	w = r->width;
840	h = r->height;
841
842	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
843	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
844
845	assert(op->floats_per_rect == 12);
846	assert((sna->render.vertex_used % 4) == 0);
847	v = sna->render.vertices + sna->render.vertex_used;
848	sna->render.vertex_used += 12;
849
850	dst.p.x = r->dst.x + r->width;
851	dst.p.y = r->dst.y + r->height;
852	v[0] = dst.f;
853	v[2] = (msk_x + w) * op->mask.scale[0];
854	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
855
856	dst.p.x = r->dst.x;
857	v[4] = dst.f;
858	v[10] = v[6] = msk_x * op->mask.scale[0];
859
860	dst.p.y = r->dst.y;
861	v[8] = dst.f;
862	v[11] = msk_y * op->mask.scale[1];
863
864	v[9] = v[5] = v[1] = .5;
865}
866
867sse2 fastcall static void
868emit_boxes_identity_mask(const struct sna_composite_op *op,
869			 const BoxRec *box, int nbox,
870			 float *v)
871{
872	float msk_x = op->mask.offset[0];
873	float msk_y = op->mask.offset[1];
874
875	do {
876		union {
877			struct sna_coordinate p;
878			float f;
879		} dst;
880
881		dst.p.x = box->x2;
882		dst.p.y = box->y2;
883		v[0] = dst.f;
884		v[2] = (msk_x + box->x2) * op->mask.scale[0];
885		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
886
887		dst.p.x = box->x1;
888		v[4] = dst.f;
889		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
890
891		dst.p.y = box->y1;
892		v[8] = dst.f;
893		v[11] = (msk_y + box->y1) * op->mask.scale[1];
894
895		v[9] = v[5] = v[1] = .5;
896		v += 12;
897		box++;
898	} while (--nbox);
899}
900
901sse2 fastcall static void
902emit_primitive_linear_identity_mask(struct sna *sna,
903				    const struct sna_composite_op *op,
904				    const struct sna_composite_rectangles *r)
905{
906	union {
907		struct sna_coordinate p;
908		float f;
909	} dst;
910	float msk_x, msk_y;
911	float w, h;
912	float *v;
913
914	msk_x = r->mask.x + op->mask.offset[0];
915	msk_y = r->mask.y + op->mask.offset[1];
916	w = r->width;
917	h = r->height;
918
919	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
920	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
921
922	assert(op->floats_per_rect == 12);
923	assert((sna->render.vertex_used % 4) == 0);
924	v = sna->render.vertices + sna->render.vertex_used;
925	sna->render.vertex_used += 12;
926
927	dst.p.x = r->dst.x + r->width;
928	dst.p.y = r->dst.y + r->height;
929	v[0] = dst.f;
930	v[2] = (msk_x + w) * op->mask.scale[0];
931	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
932
933	dst.p.x = r->dst.x;
934	v[4] = dst.f;
935	v[10] = v[6] = msk_x * op->mask.scale[0];
936
937	dst.p.y = r->dst.y;
938	v[8] = dst.f;
939	v[11] = msk_y * op->mask.scale[1];
940
941	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
942	v[5] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
943	v[9] = compute_linear(&op->src, r->src.x, r->src.y);
944}
945
946sse2 fastcall static void
947emit_boxes_linear_identity_mask(const struct sna_composite_op *op,
948				const BoxRec *box, int nbox,
949				float *v)
950{
951	float msk_x = op->mask.offset[0];
952	float msk_y = op->mask.offset[1];
953
954	do {
955		union {
956			struct sna_coordinate p;
957			float f;
958		} dst;
959
960		dst.p.x = box->x2;
961		dst.p.y = box->y2;
962		v[0] = dst.f;
963		v[2] = (msk_x + box->x2) * op->mask.scale[0];
964		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
965
966		dst.p.x = box->x1;
967		v[4] = dst.f;
968		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
969
970		dst.p.y = box->y1;
971		v[8] = dst.f;
972		v[11] = (msk_y + box->y1) * op->mask.scale[1];
973
974		v[1] = compute_linear(&op->src, box->x2, box->y2);
975		v[5] = compute_linear(&op->src, box->x1, box->y2);
976		v[9] = compute_linear(&op->src, box->x1, box->y1);
977
978		v += 12;
979		box++;
980	} while (--nbox);
981}
982
983sse2 fastcall static void
984emit_primitive_identity_source_mask(struct sna *sna,
985				    const struct sna_composite_op *op,
986				    const struct sna_composite_rectangles *r)
987{
988	union {
989		struct sna_coordinate p;
990		float f;
991	} dst;
992	float src_x, src_y;
993	float msk_x, msk_y;
994	float w, h;
995	float *v;
996
997	src_x = r->src.x + op->src.offset[0];
998	src_y = r->src.y + op->src.offset[1];
999	msk_x = r->mask.x + op->mask.offset[0];
1000	msk_y = r->mask.y + op->mask.offset[1];
1001	w = r->width;
1002	h = r->height;
1003
1004	assert(op->floats_per_rect == 15);
1005	assert((sna->render.vertex_used % 5) == 0);
1006	v = sna->render.vertices + sna->render.vertex_used;
1007	sna->render.vertex_used += 15;
1008
1009	dst.p.x = r->dst.x + r->width;
1010	dst.p.y = r->dst.y + r->height;
1011	v[0] = dst.f;
1012	v[1] = (src_x + w) * op->src.scale[0];
1013	v[2] = (src_y + h) * op->src.scale[1];
1014	v[3] = (msk_x + w) * op->mask.scale[0];
1015	v[4] = (msk_y + h) * op->mask.scale[1];
1016
1017	dst.p.x = r->dst.x;
1018	v[5] = dst.f;
1019	v[6] = src_x * op->src.scale[0];
1020	v[7] = v[2];
1021	v[8] = msk_x * op->mask.scale[0];
1022	v[9] = v[4];
1023
1024	dst.p.y = r->dst.y;
1025	v[10] = dst.f;
1026	v[11] = v[6];
1027	v[12] = src_y * op->src.scale[1];
1028	v[13] = v[8];
1029	v[14] = msk_y * op->mask.scale[1];
1030}
1031
1032sse2 fastcall static void
1033emit_primitive_simple_source_identity(struct sna *sna,
1034				      const struct sna_composite_op *op,
1035				      const struct sna_composite_rectangles *r)
1036{
1037	float *v;
1038	union {
1039		struct sna_coordinate p;
1040		float f;
1041	} dst;
1042
1043	float xx = op->src.transform->matrix[0][0];
1044	float x0 = op->src.transform->matrix[0][2];
1045	float yy = op->src.transform->matrix[1][1];
1046	float y0 = op->src.transform->matrix[1][2];
1047	float sx = op->src.scale[0];
1048	float sy = op->src.scale[1];
1049	int16_t tx = op->src.offset[0];
1050	int16_t ty = op->src.offset[1];
1051	float msk_x = r->mask.x + op->mask.offset[0];
1052	float msk_y = r->mask.y + op->mask.offset[1];
1053	float w = r->width, h = r->height;
1054
1055	assert(op->floats_per_rect == 15);
1056	assert((sna->render.vertex_used % 5) == 0);
1057	v = sna->render.vertices + sna->render.vertex_used;
1058	sna->render.vertex_used += 3*5;
1059
1060	dst.p.x = r->dst.x + r->width;
1061	dst.p.y = r->dst.y + r->height;
1062	v[0] = dst.f;
1063	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
1064	v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
1065	v[3] = (msk_x + w) * op->mask.scale[0];
1066	v[4] = (msk_y + h) * op->mask.scale[1];
1067
1068	dst.p.x = r->dst.x;
1069	v[5] = dst.f;
1070	v[6] = ((r->src.x + tx) * xx + x0) * sx;
1071	v[7] = v[2];
1072	v[8] = msk_x * op->mask.scale[0];
1073	v[9] = v[4];
1074
1075	dst.p.y = r->dst.y;
1076	v[10] = dst.f;
1077	v[11] = v[6];
1078	v[12] = ((r->src.y + ty) * yy + y0) * sy;
1079	v[13] = v[8];
1080	v[14] = msk_y * op->mask.scale[1];
1081}
1082
/* Emit one rectangle as 3 vertices of 5 floats each (packed dst
 * coordinate, transformed source s/t, mask s/t) for an affine source
 * combined with an identity (untransformed) mask.  Selected when the
 * source transform is not a plain scale+translate (see
 * gen4_choose_composite_emitter), so each source coordinate must go
 * through the full affine transform via _sna_get_transformed_scaled().
 */
sse2 fastcall static void
emit_primitive_affine_source_identity(struct sna *sna,
				      const struct sna_composite_op *op,
				      const struct sna_composite_rectangles *r)
{
	float *v;
	union {
		struct sna_coordinate p;
		float f;	/* pun the packed integer dst coordinate into a float */
	} dst;
	float msk_x = r->mask.x + op->mask.offset[0];
	float msk_y = r->mask.y + op->mask.offset[1];
	float w = r->width, h = r->height;

	assert(op->floats_per_rect == 15);
	assert((sna->render.vertex_used % 5) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*5;

	/* Vertex 1: bottom-right corner */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
				    op->src.offset[1] + r->src.y + r->height,
				    op->src.transform, op->src.scale,
				    &v[1], &v[2]);
	v[3] = (msk_x + w) * op->mask.scale[0];
	v[4] = (msk_y + h) * op->mask.scale[1];

	/* Vertex 2: bottom-left corner (reuses the y from vertex 1) */
	dst.p.x = r->dst.x;
	v[5] = dst.f;
	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
				    op->src.offset[1] + r->src.y + r->height,
				    op->src.transform, op->src.scale,
				    &v[6], &v[7]);
	v[8] = msk_x * op->mask.scale[0];
	v[9] = v[4];

	/* Vertex 3: top-left corner (reuses the x from vertex 2) */
	dst.p.y = r->dst.y;
	v[10] = dst.f;
	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
				    op->src.offset[1] + r->src.y,
				    op->src.transform, op->src.scale,
				    &v[11], &v[12]);
	v[13] = v[8];
	v[14] = msk_y * op->mask.scale[1];
}
1130
1131/* SSE4_2 */
1132#if defined(sse4_2)
1133
1134sse4_2 fastcall static void
1135emit_primitive_linear__sse4_2(struct sna *sna,
1136			      const struct sna_composite_op *op,
1137			      const struct sna_composite_rectangles *r)
1138{
1139	float *v;
1140	union {
1141		struct sna_coordinate p;
1142		float f;
1143	} dst;
1144
1145	assert(op->floats_per_rect == 6);
1146	assert((sna->render.vertex_used % 2) == 0);
1147	v = sna->render.vertices + sna->render.vertex_used;
1148	sna->render.vertex_used += 6;
1149	assert(sna->render.vertex_used <= sna->render.vertex_size);
1150
1151	dst.p.x = r->dst.x + r->width;
1152	dst.p.y = r->dst.y + r->height;
1153	v[0] = dst.f;
1154	dst.p.x = r->dst.x;
1155	v[2] = dst.f;
1156	dst.p.y = r->dst.y;
1157	v[4] = dst.f;
1158
1159	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
1160	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
1161	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
1162}
1163
1164sse4_2 fastcall static void
1165emit_boxes_linear__sse4_2(const struct sna_composite_op *op,
1166			  const BoxRec *box, int nbox,
1167			  float *v)
1168{
1169	union {
1170		struct sna_coordinate p;
1171		float f;
1172	} dst;
1173
1174	do {
1175		dst.p.x = box->x2;
1176		dst.p.y = box->y2;
1177		v[0] = dst.f;
1178		dst.p.x = box->x1;
1179		v[2] = dst.f;
1180		dst.p.y = box->y1;
1181		v[4] = dst.f;
1182
1183		v[1] = compute_linear(&op->src, box->x2, box->y2);
1184		v[3] = compute_linear(&op->src, box->x1, box->y2);
1185		v[5] = compute_linear(&op->src, box->x1, box->y1);
1186
1187		v += 6;
1188		box++;
1189	} while (--nbox);
1190}
1191
/* SSE4.2 build: emit one rectangle as 3 vertices of 3 floats each
 * (packed dst coordinate, source s, source t) for an untransformed
 * (identity) source: the texture coordinates are simply the source
 * rectangle offset and scaled into the normalized texture space.
 */
sse4_2 fastcall static void
emit_primitive_identity_source__sse4_2(struct sna *sna,
				       const struct sna_composite_op *op,
				       const struct sna_composite_rectangles *r)
{
	union {
		struct sna_coordinate p;
		float f;	/* pun the packed integer dst coordinate into a float */
	} dst;
	float *v;

	assert(op->floats_per_rect == 9);
	assert((sna->render.vertex_used % 3) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 9;

	/* Packed dst coordinates: bottom-right, bottom-left, top-left. */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	dst.p.x = r->dst.x;
	v[3] = dst.f;
	dst.p.y = r->dst.y;
	v[6] = dst.f;

	/* Shared s coordinate for the two left vertices, then right edge. */
	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
	v[1] = v[4] + r->width * op->src.scale[0];

	/* Shared t coordinate for the two bottom vertices, from the top edge. */
	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
}
1222
/* SSE4.2 build: batch variant of emit_primitive_identity_source for a
 * run of boxes; 3 vertices of 3 floats (packed dst, src s/t) per box.
 */
sse4_2 fastcall static void
emit_boxes_identity_source__sse4_2(const struct sna_composite_op *op,
				   const BoxRec *box, int nbox,
				   float *v)
{
	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* Packed dst: bottom-right, bottom-left, top-left. */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		dst.p.x = box->x1;
		v[3] = dst.f;
		dst.p.y = box->y1;
		v[6] = dst.f;

		/* Identity transform: texcoord = (coord + offset) * scale. */
		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];

		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];

		v += 9;
		box++;
	} while (--nbox);
}
1252
/* SSE4.2 build: emit one rectangle as 3 vertices of 3 floats each
 * (packed dst coordinate, src s/t) for a "simple" source transform:
 * only the diagonal (scale) and translation entries of the matrix are
 * used, so each coordinate maps independently as (c*scale + offset).
 */
sse4_2 fastcall static void
emit_primitive_simple_source__sse4_2(struct sna *sna,
				     const struct sna_composite_op *op,
				     const struct sna_composite_rectangles *r)
{
	float *v;
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;

	float xx = op->src.transform->matrix[0][0];
	float x0 = op->src.transform->matrix[0][2];
	float yy = op->src.transform->matrix[1][1];
	float y0 = op->src.transform->matrix[1][2];
	float sx = op->src.scale[0];
	float sy = op->src.scale[1];
	int16_t tx = op->src.offset[0];
	int16_t ty = op->src.offset[1];

	assert(op->floats_per_rect == 9);
	assert((sna->render.vertex_used % 3) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*3;

	/* Vertex 1: bottom-right corner */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;

	/* Vertex 2: bottom-left corner (t shared with vertex 1) */
	dst.p.x = r->dst.x;
	v[3] = dst.f;
	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;

	/* Vertex 3: top-left corner (s shared with vertex 2) */
	dst.p.y = r->dst.y;
	v[6] = dst.f;
	v[8] = ((r->src.y + ty) * yy + y0) * sy;
}
1292
/* SSE4.2 build: batch variant of emit_primitive_simple_source; the
 * scale/translate matrix entries are loop-invariant and hoisted out of
 * the per-box loop.
 */
sse4_2 fastcall static void
emit_boxes_simple_source__sse4_2(const struct sna_composite_op *op,
				 const BoxRec *box, int nbox,
				 float *v)
{
	float xx = op->src.transform->matrix[0][0];
	float x0 = op->src.transform->matrix[0][2];
	float yy = op->src.transform->matrix[1][1];
	float y0 = op->src.transform->matrix[1][2];
	float sx = op->src.scale[0];
	float sy = op->src.scale[1];
	int16_t tx = op->src.offset[0];
	int16_t ty = op->src.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* bottom-right */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		v[1] = ((box->x2 + tx) * xx + x0) * sx;
		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;

		/* bottom-left */
		dst.p.x = box->x1;
		v[3] = dst.f;
		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;

		/* top-left */
		dst.p.y = box->y1;
		v[6] = dst.f;
		v[8] = ((box->y1 + ty) * yy + y0) * sy;

		v += 9;
		box++;
	} while (--nbox);
}
1331
/* SSE4.2 build: emit one rectangle as 3 vertices of 4 floats each
 * (packed dst coordinate, source channel, mask s/t) for a solid source
 * with an identity mask.  The source channel is the constant 0.5 —
 * the centre of the solid 1x1 source texture.
 */
sse4_2 fastcall static void
emit_primitive_identity_mask__sse4_2(struct sna *sna,
				     const struct sna_composite_op *op,
				     const struct sna_composite_rectangles *r)
{
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;
	float msk_x, msk_y;
	float w, h;
	float *v;

	msk_x = r->mask.x + op->mask.offset[0];
	msk_y = r->mask.y + op->mask.offset[1];
	w = r->width;
	h = r->height;

	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));

	assert(op->floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;

	/* Vertex 1: bottom-right corner */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	v[2] = (msk_x + w) * op->mask.scale[0];
	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];

	/* Vertex 2: bottom-left corner (mask t shared with vertex 1) */
	dst.p.x = r->dst.x;
	v[4] = dst.f;
	v[10] = v[6] = msk_x * op->mask.scale[0];

	/* Vertex 3: top-left corner (mask s shared with vertex 2) */
	dst.p.y = r->dst.y;
	v[8] = dst.f;
	v[11] = msk_y * op->mask.scale[1];

	/* Constant source channel for all three vertices. */
	v[9] = v[5] = v[1] = .5;
}
1374
/* SSE4.2 build: batch variant of emit_primitive_identity_mask for a run
 * of boxes; 3 vertices of 4 floats (packed dst, constant source channel,
 * mask s/t) per box.
 */
sse4_2 fastcall static void
emit_boxes_identity_mask__sse4_2(const struct sna_composite_op *op,
				 const BoxRec *box, int nbox,
				 float *v)
{
	float msk_x = op->mask.offset[0];
	float msk_y = op->mask.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* bottom-right */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		v[2] = (msk_x + box->x2) * op->mask.scale[0];
		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];

		/* bottom-left */
		dst.p.x = box->x1;
		v[4] = dst.f;
		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];

		/* top-left */
		dst.p.y = box->y1;
		v[8] = dst.f;
		v[11] = (msk_y + box->y1) * op->mask.scale[1];

		/* Constant source channel (solid 1x1 source texture). */
		v[9] = v[5] = v[1] = .5;
		v += 12;
		box++;
	} while (--nbox);
}
1408
/* SSE4.2 build: emit one rectangle as 3 vertices of 4 floats each
 * (packed dst coordinate, linear-gradient parameter, mask s/t) for a
 * linear-gradient source with an identity mask.
 */
sse4_2 fastcall static void
emit_primitive_linear_identity_mask__sse4_2(struct sna *sna,
					    const struct sna_composite_op *op,
					    const struct sna_composite_rectangles *r)
{
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;
	float msk_x, msk_y;
	float w, h;
	float *v;

	msk_x = r->mask.x + op->mask.offset[0];
	msk_y = r->mask.y + op->mask.offset[1];
	w = r->width;
	h = r->height;

	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));

	assert(op->floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;

	/* Vertex 1: bottom-right corner */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	v[2] = (msk_x + w) * op->mask.scale[0];
	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];

	/* Vertex 2: bottom-left corner (mask t shared with vertex 1) */
	dst.p.x = r->dst.x;
	v[4] = dst.f;
	v[10] = v[6] = msk_x * op->mask.scale[0];

	/* Vertex 3: top-left corner (mask s shared with vertex 2) */
	dst.p.y = r->dst.y;
	v[8] = dst.f;
	v[11] = msk_y * op->mask.scale[1];

	/* Gradient parameter evaluated at each corner. */
	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
	v[5] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
	v[9] = compute_linear(&op->src, r->src.x, r->src.y);
}
1453
/* SSE4.2 build: batch variant of emit_primitive_linear_identity_mask;
 * 3 vertices of 4 floats (packed dst, gradient parameter, mask s/t)
 * per box.
 */
sse4_2 fastcall static void
emit_boxes_linear_identity_mask__sse4_2(const struct sna_composite_op *op,
					const BoxRec *box, int nbox,
					float *v)
{
	float msk_x = op->mask.offset[0];
	float msk_y = op->mask.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* bottom-right */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		v[2] = (msk_x + box->x2) * op->mask.scale[0];
		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];

		/* bottom-left */
		dst.p.x = box->x1;
		v[4] = dst.f;
		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];

		/* top-left */
		dst.p.y = box->y1;
		v[8] = dst.f;
		v[11] = (msk_y + box->y1) * op->mask.scale[1];

		/* Gradient parameter evaluated at each corner. */
		v[1] = compute_linear(&op->src, box->x2, box->y2);
		v[5] = compute_linear(&op->src, box->x1, box->y2);
		v[9] = compute_linear(&op->src, box->x1, box->y1);

		v += 12;
		box++;
	} while (--nbox);
}
1490
1491#endif
1492
1493/* AVX2 */
1494#if defined(avx2)
1495
/* AVX2 build of emit_primitive_linear: one rectangle as 3 vertices of
 * 2 floats each (packed dst coordinate, linear-gradient parameter).
 */
avx2 fastcall static void
emit_primitive_linear__avx2(struct sna *sna,
			    const struct sna_composite_op *op,
			    const struct sna_composite_rectangles *r)
{
	float *v;
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;

	assert(op->floats_per_rect == 6);
	assert((sna->render.vertex_used % 2) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 6;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* Packed dst: bottom-right, bottom-left, top-left. */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	dst.p.x = r->dst.x;
	v[2] = dst.f;
	dst.p.y = r->dst.y;
	v[4] = dst.f;

	/* Gradient parameter evaluated at each corner. */
	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
}
1525
/* AVX2 build: batch variant of emit_primitive_linear for a run of
 * boxes; 3 vertices of 2 floats (packed dst, gradient parameter) per box.
 */
avx2 fastcall static void
emit_boxes_linear__avx2(const struct sna_composite_op *op,
			const BoxRec *box, int nbox,
			float *v)
{
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;

	do {
		/* Packed dst: bottom-right, bottom-left, top-left. */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		dst.p.x = box->x1;
		v[2] = dst.f;
		dst.p.y = box->y1;
		v[4] = dst.f;

		/* Gradient parameter evaluated at each corner. */
		v[1] = compute_linear(&op->src, box->x2, box->y2);
		v[3] = compute_linear(&op->src, box->x1, box->y2);
		v[5] = compute_linear(&op->src, box->x1, box->y1);

		v += 6;
		box++;
	} while (--nbox);
}
1553
/* AVX2 build of emit_primitive_identity_source: one rectangle as 3
 * vertices of 3 floats each (packed dst, src s/t) for an untransformed
 * source.
 */
avx2 fastcall static void
emit_primitive_identity_source__avx2(struct sna *sna,
				     const struct sna_composite_op *op,
				     const struct sna_composite_rectangles *r)
{
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;
	float *v;

	assert(op->floats_per_rect == 9);
	assert((sna->render.vertex_used % 3) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 9;

	/* Packed dst: bottom-right, bottom-left, top-left. */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	dst.p.x = r->dst.x;
	v[3] = dst.f;
	dst.p.y = r->dst.y;
	v[6] = dst.f;

	/* Shared s for the two left vertices; right edge derived from it. */
	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
	v[1] = v[4] + r->width * op->src.scale[0];

	/* Shared t for the two bottom vertices, from the top edge. */
	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
}
1584
/* AVX2 build: batch variant of emit_primitive_identity_source; 3
 * vertices of 3 floats (packed dst, src s/t) per box.
 */
avx2 fastcall static void
emit_boxes_identity_source__avx2(const struct sna_composite_op *op,
				 const BoxRec *box, int nbox,
				 float *v)
{
	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* Packed dst: bottom-right, bottom-left, top-left. */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		dst.p.x = box->x1;
		v[3] = dst.f;
		dst.p.y = box->y1;
		v[6] = dst.f;

		/* Identity transform: texcoord = (coord + offset) * scale. */
		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];

		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];

		v += 9;
		box++;
	} while (--nbox);
}
1614
/* AVX2 build of emit_primitive_simple_source: one rectangle as 3
 * vertices of 3 floats each (packed dst, src s/t) for a scale+translate
 * source transform (only the diagonal and translation matrix entries
 * are used).
 */
avx2 fastcall static void
emit_primitive_simple_source__avx2(struct sna *sna,
				   const struct sna_composite_op *op,
				   const struct sna_composite_rectangles *r)
{
	float *v;
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;

	float xx = op->src.transform->matrix[0][0];
	float x0 = op->src.transform->matrix[0][2];
	float yy = op->src.transform->matrix[1][1];
	float y0 = op->src.transform->matrix[1][2];
	float sx = op->src.scale[0];
	float sy = op->src.scale[1];
	int16_t tx = op->src.offset[0];
	int16_t ty = op->src.offset[1];

	assert(op->floats_per_rect == 9);
	assert((sna->render.vertex_used % 3) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*3;

	/* Vertex 1: bottom-right corner */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;

	/* Vertex 2: bottom-left corner (t shared with vertex 1) */
	dst.p.x = r->dst.x;
	v[3] = dst.f;
	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;

	/* Vertex 3: top-left corner (s shared with vertex 2) */
	dst.p.y = r->dst.y;
	v[6] = dst.f;
	v[8] = ((r->src.y + ty) * yy + y0) * sy;
}
1654
/* AVX2 build: batch variant of emit_primitive_simple_source; the
 * scale/translate matrix entries are loop-invariant and hoisted out of
 * the per-box loop.
 */
avx2 fastcall static void
emit_boxes_simple_source__avx2(const struct sna_composite_op *op,
			       const BoxRec *box, int nbox,
			       float *v)
{
	float xx = op->src.transform->matrix[0][0];
	float x0 = op->src.transform->matrix[0][2];
	float yy = op->src.transform->matrix[1][1];
	float y0 = op->src.transform->matrix[1][2];
	float sx = op->src.scale[0];
	float sy = op->src.scale[1];
	int16_t tx = op->src.offset[0];
	int16_t ty = op->src.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* bottom-right */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		v[1] = ((box->x2 + tx) * xx + x0) * sx;
		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;

		/* bottom-left */
		dst.p.x = box->x1;
		v[3] = dst.f;
		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;

		/* top-left */
		dst.p.y = box->y1;
		v[6] = dst.f;
		v[8] = ((box->y1 + ty) * yy + y0) * sy;

		v += 9;
		box++;
	} while (--nbox);
}
1693
/* AVX2 build of emit_primitive_identity_mask: one rectangle as 3
 * vertices of 4 floats each (packed dst, constant source channel,
 * mask s/t) for a solid source with an identity mask.
 */
avx2 fastcall static void
emit_primitive_identity_mask__avx2(struct sna *sna,
				   const struct sna_composite_op *op,
				   const struct sna_composite_rectangles *r)
{
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;
	float msk_x, msk_y;
	float w, h;
	float *v;

	msk_x = r->mask.x + op->mask.offset[0];
	msk_y = r->mask.y + op->mask.offset[1];
	w = r->width;
	h = r->height;

	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));

	assert(op->floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;

	/* Vertex 1: bottom-right corner */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	v[2] = (msk_x + w) * op->mask.scale[0];
	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];

	/* Vertex 2: bottom-left corner (mask t shared with vertex 1) */
	dst.p.x = r->dst.x;
	v[4] = dst.f;
	v[10] = v[6] = msk_x * op->mask.scale[0];

	/* Vertex 3: top-left corner (mask s shared with vertex 2) */
	dst.p.y = r->dst.y;
	v[8] = dst.f;
	v[11] = msk_y * op->mask.scale[1];

	/* Constant source channel (solid 1x1 source texture). */
	v[9] = v[5] = v[1] = .5;
}
1736
/* AVX2 build: batch variant of emit_primitive_identity_mask; 3
 * vertices of 4 floats (packed dst, constant source channel, mask s/t)
 * per box.
 */
avx2 fastcall static void
emit_boxes_identity_mask__avx2(const struct sna_composite_op *op,
			       const BoxRec *box, int nbox,
			       float *v)
{
	float msk_x = op->mask.offset[0];
	float msk_y = op->mask.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* bottom-right */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		v[2] = (msk_x + box->x2) * op->mask.scale[0];
		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];

		/* bottom-left */
		dst.p.x = box->x1;
		v[4] = dst.f;
		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];

		/* top-left */
		dst.p.y = box->y1;
		v[8] = dst.f;
		v[11] = (msk_y + box->y1) * op->mask.scale[1];

		/* Constant source channel (solid 1x1 source texture). */
		v[9] = v[5] = v[1] = .5;
		v += 12;
		box++;
	} while (--nbox);
}
1770
/* AVX2 build of emit_primitive_linear_identity_mask: one rectangle as
 * 3 vertices of 4 floats each (packed dst, linear-gradient parameter,
 * mask s/t).
 */
avx2 fastcall static void
emit_primitive_linear_identity_mask__avx2(struct sna *sna,
					  const struct sna_composite_op *op,
					  const struct sna_composite_rectangles *r)
{
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;
	float msk_x, msk_y;
	float w, h;
	float *v;

	msk_x = r->mask.x + op->mask.offset[0];
	msk_y = r->mask.y + op->mask.offset[1];
	w = r->width;
	h = r->height;

	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));

	assert(op->floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 12;

	/* Vertex 1: bottom-right corner */
	dst.p.x = r->dst.x + r->width;
	dst.p.y = r->dst.y + r->height;
	v[0] = dst.f;
	v[2] = (msk_x + w) * op->mask.scale[0];
	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];

	/* Vertex 2: bottom-left corner (mask t shared with vertex 1) */
	dst.p.x = r->dst.x;
	v[4] = dst.f;
	v[10] = v[6] = msk_x * op->mask.scale[0];

	/* Vertex 3: top-left corner (mask s shared with vertex 2) */
	dst.p.y = r->dst.y;
	v[8] = dst.f;
	v[11] = msk_y * op->mask.scale[1];

	/* Gradient parameter evaluated at each corner. */
	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
	v[5] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
	v[9] = compute_linear(&op->src, r->src.x, r->src.y);
}
1815
/* AVX2 build: batch variant of emit_primitive_linear_identity_mask;
 * 3 vertices of 4 floats (packed dst, gradient parameter, mask s/t)
 * per box.
 */
avx2 fastcall static void
emit_boxes_linear_identity_mask__avx2(const struct sna_composite_op *op,
				      const BoxRec *box, int nbox,
				      float *v)
{
	float msk_x = op->mask.offset[0];
	float msk_y = op->mask.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;	/* pun packed dst coordinate into a float */
		} dst;

		/* bottom-right */
		dst.p.x = box->x2;
		dst.p.y = box->y2;
		v[0] = dst.f;
		v[2] = (msk_x + box->x2) * op->mask.scale[0];
		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];

		/* bottom-left */
		dst.p.x = box->x1;
		v[4] = dst.f;
		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];

		/* top-left */
		dst.p.y = box->y1;
		v[8] = dst.f;
		v[11] = (msk_y + box->y1) * op->mask.scale[1];

		/* Gradient parameter evaluated at each corner. */
		v[1] = compute_linear(&op->src, box->x2, box->y2);
		v[5] = compute_linear(&op->src, box->x1, box->y2);
		v[9] = compute_linear(&op->src, box->x1, box->y1);

		v += 12;
		box++;
	} while (--nbox);
}
1852
1853#endif
1854
1855unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp)
1856{
1857	unsigned vb;
1858
1859	if (tmp->mask.bo) {
1860		if (tmp->mask.transform == NULL) {
1861			if (tmp->src.is_solid) {
1862				DBG(("%s: solid, identity mask\n", __FUNCTION__));
1863#if defined(avx2)
1864				if (sna->cpu_features & AVX2) {
1865					tmp->prim_emit = emit_primitive_identity_mask__avx2;
1866					tmp->emit_boxes = emit_boxes_identity_mask__avx2;
1867				} else
1868#endif
1869#if defined(sse4_2)
1870				if (sna->cpu_features & SSE4_2) {
1871					tmp->prim_emit = emit_primitive_identity_mask__sse4_2;
1872					tmp->emit_boxes = emit_boxes_identity_mask__sse4_2;
1873				} else
1874#endif
1875				{
1876					tmp->prim_emit = emit_primitive_identity_mask;
1877					tmp->emit_boxes = emit_boxes_identity_mask;
1878				}
1879				tmp->floats_per_vertex = 4;
1880				vb = 1 | 2 << 2;
1881			} else if (tmp->src.is_linear) {
1882				DBG(("%s: linear, identity mask\n", __FUNCTION__));
1883#if defined(avx2)
1884				if (sna->cpu_features & AVX2) {
1885					tmp->prim_emit = emit_primitive_linear_identity_mask__avx2;
1886					tmp->emit_boxes = emit_boxes_linear_identity_mask__avx2;
1887				} else
1888#endif
1889#if defined(sse4_2)
1890				if (sna->cpu_features & SSE4_2) {
1891					tmp->prim_emit = emit_primitive_linear_identity_mask__sse4_2;
1892					tmp->emit_boxes = emit_boxes_linear_identity_mask__sse4_2;
1893				} else
1894#endif
1895				{
1896					tmp->prim_emit = emit_primitive_linear_identity_mask;
1897					tmp->emit_boxes = emit_boxes_linear_identity_mask;
1898				}
1899				tmp->floats_per_vertex = 4;
1900				vb = 1 | 2 << 2;
1901			} else if (tmp->src.transform == NULL) {
1902				DBG(("%s: identity source, identity mask\n", __FUNCTION__));
1903				tmp->prim_emit = emit_primitive_identity_source_mask;
1904				tmp->floats_per_vertex = 5;
1905				vb = 2 << 2 | 2;
1906			} else if (tmp->src.is_affine) {
1907				tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
1908				tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
1909				if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
1910					DBG(("%s: simple src, identity mask\n", __FUNCTION__));
1911					tmp->prim_emit = emit_primitive_simple_source_identity;
1912				} else {
1913					DBG(("%s: affine src, identity mask\n", __FUNCTION__));
1914					tmp->prim_emit = emit_primitive_affine_source_identity;
1915				}
1916				tmp->floats_per_vertex = 5;
1917				vb = 2 << 2 | 2;
1918			} else {
1919				DBG(("%s: projective source, identity mask\n", __FUNCTION__));
1920				tmp->prim_emit = emit_primitive_mask;
1921				tmp->floats_per_vertex = 6;
1922				vb = 2 << 2 | 3;
1923			}
1924		} else {
1925			tmp->prim_emit = emit_primitive_mask;
1926			tmp->emit_boxes = emit_boxes_mask;
1927			tmp->floats_per_vertex = 1;
1928			vb = 0;
1929			if (tmp->mask.is_solid) {
1930				tmp->floats_per_vertex += 1;
1931				vb |= 1 << 2;
1932			} else if (tmp->mask.is_affine) {
1933				tmp->floats_per_vertex += 2;
1934				vb |= 2 << 2;
1935			}else {
1936				tmp->floats_per_vertex += 3;
1937				vb |= 3 << 2;
1938			}
1939			if (tmp->src.is_solid) {
1940				tmp->floats_per_vertex += 1;
1941				vb |= 1;
1942			} else if (tmp->src.is_affine) {
1943				tmp->floats_per_vertex += 2;
1944				vb |= 2 ;
1945			}else {
1946				tmp->floats_per_vertex += 3;
1947				vb |= 3;
1948			}
1949			DBG(("%s: general mask: floats-per-vertex=%d, vb=%x\n",
1950			     __FUNCTION__,tmp->floats_per_vertex, vb));
1951		}
1952	} else {
1953		if (tmp->src.is_solid) {
1954			DBG(("%s: solid, no mask\n", __FUNCTION__));
1955			tmp->prim_emit = emit_primitive_solid;
1956			tmp->emit_boxes = emit_boxes_solid;
1957			if (tmp->src.is_opaque && tmp->op == PictOpOver)
1958				tmp->op = PictOpSrc;
1959			tmp->floats_per_vertex = 2;
1960			vb = 1;
1961		} else if (tmp->src.is_linear) {
1962			DBG(("%s: linear, no mask\n", __FUNCTION__));
1963#if defined(avx2)
1964			if (sna->cpu_features & AVX2) {
1965				tmp->prim_emit = emit_primitive_linear__avx2;
1966				tmp->emit_boxes = emit_boxes_linear__avx2;
1967			} else
1968#endif
1969#if defined(sse4_2)
1970			if (sna->cpu_features & SSE4_2) {
1971				tmp->prim_emit = emit_primitive_linear__sse4_2;
1972				tmp->emit_boxes = emit_boxes_linear__sse4_2;
1973			} else
1974#endif
1975			{
1976				tmp->prim_emit = emit_primitive_linear;
1977				tmp->emit_boxes = emit_boxes_linear;
1978			}
1979			tmp->floats_per_vertex = 2;
1980			vb = 1;
1981		} else if (tmp->src.transform == NULL) {
1982			DBG(("%s: identity src, no mask\n", __FUNCTION__));
1983#if defined(avx2)
1984			if (sna->cpu_features & AVX2) {
1985				tmp->prim_emit = emit_primitive_identity_source__avx2;
1986				tmp->emit_boxes = emit_boxes_identity_source__avx2;
1987			} else
1988#endif
1989#if defined(sse4_2)
1990			if (sna->cpu_features & SSE4_2) {
1991				tmp->prim_emit = emit_primitive_identity_source__sse4_2;
1992				tmp->emit_boxes = emit_boxes_identity_source__sse4_2;
1993			} else
1994#endif
1995			{
1996				tmp->prim_emit = emit_primitive_identity_source;
1997				tmp->emit_boxes = emit_boxes_identity_source;
1998			}
1999			tmp->floats_per_vertex = 3;
2000			vb = 2;
2001		} else if (tmp->src.is_affine) {
2002			tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
2003			tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
2004			if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
2005				DBG(("%s: simple src, no mask\n", __FUNCTION__));
2006#if defined(avx2)
2007				if (sna->cpu_features & AVX2) {
2008					tmp->prim_emit = emit_primitive_simple_source__avx2;
2009					tmp->emit_boxes = emit_boxes_simple_source__avx2;
2010				} else
2011#endif
2012#if defined(sse4_2)
2013				if (sna->cpu_features & SSE4_2) {
2014					tmp->prim_emit = emit_primitive_simple_source__sse4_2;
2015					tmp->emit_boxes = emit_boxes_simple_source__sse4_2;
2016				} else
2017#endif
2018				{
2019					tmp->prim_emit = emit_primitive_simple_source;
2020					tmp->emit_boxes = emit_boxes_simple_source;
2021				}
2022			} else {
2023				DBG(("%s: affine src, no mask\n", __FUNCTION__));
2024				tmp->prim_emit = emit_primitive_affine_source;
2025				tmp->emit_boxes = emit_boxes_affine_source;
2026			}
2027			tmp->floats_per_vertex = 3;
2028			vb = 2;
2029		} else {
2030			DBG(("%s: projective src, no mask\n", __FUNCTION__));
2031			assert(!tmp->src.is_solid);
2032			tmp->prim_emit = emit_primitive;
2033			tmp->emit_boxes = emit_boxes;
2034			tmp->floats_per_vertex = 4;
2035			vb = 3;
2036		}
2037	}
2038	tmp->floats_per_rect = 3 * tmp->floats_per_vertex;
2039
2040	return vb;
2041}
2042
/* Append one span vertex: the packed dst coordinate followed by the
 * source texture coordinates for (x, y), via the OUT_VERTEX /
 * emit_texcoord helpers that write into the current vertex buffer.
 */
sse2 force_inline static void
emit_span_vertex(struct sna *sna,
		  const struct sna_composite_spans_op *op,
		  int16_t x, int16_t y)
{
	OUT_VERTEX(x, y);
	emit_texcoord(sna, &op->base.src, x, y);
}
2051
/* Generic span emitter: one box as 3 vertices (bottom-right,
 * bottom-left, top-left), each vertex followed by the span opacity.
 */
sse2 fastcall static void
emit_span_primitive(struct sna *sna,
		    const struct sna_composite_spans_op *op,
		    const BoxRec *box,
		    float opacity)
{
	emit_span_vertex(sna, op, box->x2, box->y2);
	OUT_VERTEX_F(opacity);

	emit_span_vertex(sna, op, box->x1, box->y2);
	OUT_VERTEX_F(opacity);

	emit_span_vertex(sna, op, box->x1, box->y1);
	OUT_VERTEX_F(opacity);
}
2067
2068sse2 fastcall static void
2069emit_span_boxes(const struct sna_composite_spans_op *op,
2070		const struct sna_opacity_box *b, int nbox,
2071		float *v)
2072{
2073	do {
2074		v = vemit_vertex(v, &op->base, b->box.x2, b->box.y2);
2075		*v++ = b->alpha;
2076
2077		v = vemit_vertex(v, &op->base, b->box.x1, b->box.y2);
2078		*v++ = b->alpha;
2079
2080		v = vemit_vertex(v, &op->base, b->box.x1, b->box.y1);
2081		*v++ = b->alpha;
2082
2083		b++;
2084	} while (--nbox);
2085}
2086
2087sse2 fastcall static void
2088emit_span_solid(struct sna *sna,
2089		const struct sna_composite_spans_op *op,
2090		const BoxRec *box,
2091		float opacity)
2092{
2093	float *v;
2094	union {
2095		struct sna_coordinate p;
2096		float f;
2097	} dst;
2098
2099	assert(op->base.floats_per_rect == 9);
2100	assert((sna->render.vertex_used % 3) == 0);
2101	v = sna->render.vertices + sna->render.vertex_used;
2102	sna->render.vertex_used += 3*3;
2103
2104	dst.p.x = box->x2;
2105	dst.p.y = box->y2;
2106	v[0] = dst.f;
2107
2108	dst.p.x = box->x1;
2109	v[3] = dst.f;
2110
2111	dst.p.y = box->y1;
2112	v[6] = dst.f;
2113
2114	v[7] = v[4] = v[1] = .5;
2115	v[8] = v[5] = v[2] = opacity;
2116}
2117
2118sse2 fastcall static void
2119emit_span_boxes_solid(const struct sna_composite_spans_op *op,
2120		      const struct sna_opacity_box *b,
2121		      int nbox, float *v)
2122{
2123	do {
2124		union {
2125			struct sna_coordinate p;
2126			float f;
2127		} dst;
2128
2129		dst.p.x = b->box.x2;
2130		dst.p.y = b->box.y2;
2131		v[0] = dst.f;
2132
2133		dst.p.x = b->box.x1;
2134		v[3] = dst.f;
2135
2136		dst.p.y = b->box.y1;
2137		v[6] = dst.f;
2138
2139		v[7] = v[4] = v[1] = .5;
2140		v[8] = v[5] = v[2] = b->alpha;
2141
2142		v += 9;
2143		b++;
2144	} while (--nbox);
2145}
2146
/* Identity-transform span: one box as 3 vertices of 4 floats each
 * (packed dst coordinate, src s/t, opacity); the source mapping is a
 * plain scale+offset.
 */
sse2 fastcall static void
emit_span_identity(struct sna *sna,
		    const struct sna_composite_spans_op *op,
		    const BoxRec *box,
		    float opacity)
{
	float *v;
	union {
		struct sna_coordinate p;
		float f;	/* pun packed dst coordinate into a float */
	} dst;

	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	assert(op->base.floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*4;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* Vertex 1: bottom-right corner */
	dst.p.x = box->x2;
	dst.p.y = box->y2;
	v[0] = dst.f;
	v[1] = (box->x2 + tx) * sx;
	v[6] = v[2] = (box->y2 + ty) * sy;

	/* Vertex 2: bottom-left corner (t shared with vertex 1) */
	dst.p.x = box->x1;
	v[4] = dst.f;
	v[9] = v[5] = (box->x1 + tx) * sx;

	/* Vertex 3: top-left corner (s shared with vertex 2) */
	dst.p.y = box->y1;
	v[8] = dst.f;
	v[10] = (box->y1 + ty) * sy;

	/* The span opacity, identical for all three vertices. */
	v[11] = v[7] = v[3] = opacity;
}
2186
2187sse2 fastcall static void
2188emit_span_boxes_identity(const struct sna_composite_spans_op *op,
2189			 const struct sna_opacity_box *b, int nbox,
2190			 float *v)
2191{
2192	do {
2193		union {
2194			struct sna_coordinate p;
2195			float f;
2196		} dst;
2197
2198		float sx = op->base.src.scale[0];
2199		float sy = op->base.src.scale[1];
2200		int16_t tx = op->base.src.offset[0];
2201		int16_t ty = op->base.src.offset[1];
2202
2203		dst.p.x = b->box.x2;
2204		dst.p.y = b->box.y2;
2205		v[0] = dst.f;
2206		v[1] = (b->box.x2 + tx) * sx;
2207		v[6] = v[2] = (b->box.y2 + ty) * sy;
2208
2209		dst.p.x = b->box.x1;
2210		v[4] = dst.f;
2211		v[9] = v[5] = (b->box.x1 + tx) * sx;
2212
2213		dst.p.y = b->box.y1;
2214		v[8] = dst.f;
2215		v[10] = (b->box.y1 + ty) * sy;
2216
2217		v[11] = v[7] = v[3] = b->alpha;
2218
2219		v += 12;
2220		b++;
2221	} while (--nbox);
2222}
2223
2224sse2 fastcall static void
2225emit_span_simple(struct sna *sna,
2226		 const struct sna_composite_spans_op *op,
2227		 const BoxRec *box,
2228		 float opacity)
2229{
2230	float *v;
2231	union {
2232		struct sna_coordinate p;
2233		float f;
2234	} dst;
2235
2236	float xx = op->base.src.transform->matrix[0][0];
2237	float x0 = op->base.src.transform->matrix[0][2];
2238	float yy = op->base.src.transform->matrix[1][1];
2239	float y0 = op->base.src.transform->matrix[1][2];
2240	float sx = op->base.src.scale[0];
2241	float sy = op->base.src.scale[1];
2242	int16_t tx = op->base.src.offset[0];
2243	int16_t ty = op->base.src.offset[1];
2244
2245	assert(op->base.floats_per_rect == 12);
2246	assert((sna->render.vertex_used % 4) == 0);
2247	v = sna->render.vertices + sna->render.vertex_used;
2248	sna->render.vertex_used += 3*4;
2249	assert(sna->render.vertex_used <= sna->render.vertex_size);
2250
2251	dst.p.x = box->x2;
2252	dst.p.y = box->y2;
2253	v[0] = dst.f;
2254	v[1] = ((box->x2 + tx) * xx + x0) * sx;
2255	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
2256
2257	dst.p.x = box->x1;
2258	v[4] = dst.f;
2259	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
2260
2261	dst.p.y = box->y1;
2262	v[8] = dst.f;
2263	v[10] = ((box->y1 + ty) * yy + y0) * sy;
2264
2265	v[11] = v[7] = v[3] = opacity;
2266}
2267
sse2 fastcall static void
emit_span_boxes_simple(const struct sna_composite_spans_op *op,
		       const struct sna_opacity_box *b, int nbox,
		       float *v)
{
	/* Batch variant of emit_span_simple(): emit nbox rectangles of
	 * 12 floats each (packed dst coord, src u, src v, opacity per
	 * vertex) for a scale+translate-only source transform.  The
	 * transform and scale are per-operation constants, so they are
	 * loaded once before the loop.  nbox must be >= 1.
	 */
	float xx = op->base.src.transform->matrix[0][0];
	float x0 = op->base.src.transform->matrix[0][2];
	float yy = op->base.src.transform->matrix[1][1];
	float y0 = op->base.src.transform->matrix[1][2];
	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* vertex 0: (x2, y2) */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;

		/* vertex 1: (x1, y2) shares the v coordinate above */
		dst.p.x = b->box.x1;
		v[4] = dst.f;
		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;

		/* vertex 2: (x1, y1) shares the u coordinate above */
		dst.p.y = b->box.y1;
		v[8] = dst.f;
		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;

		/* per-box coverage replicated to all three vertices */
		v[11] = v[7] = v[3] = b->alpha;

		v += 12;
		b++;
	} while (--nbox);
}
2308
2309sse2 fastcall static void
2310emit_span_affine(struct sna *sna,
2311		  const struct sna_composite_spans_op *op,
2312		  const BoxRec *box,
2313		  float opacity)
2314{
2315	union {
2316		struct sna_coordinate p;
2317		float f;
2318	} dst;
2319	float *v;
2320
2321	assert(op->base.floats_per_rect == 12);
2322	assert((sna->render.vertex_used % 4) == 0);
2323	v = sna->render.vertices + sna->render.vertex_used;
2324	sna->render.vertex_used += 12;
2325
2326	dst.p.x = box->x2;
2327	dst.p.y = box->y2;
2328	v[0] = dst.f;
2329	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
2330				    op->base.src.offset[1] + box->y2,
2331				    op->base.src.transform,
2332				    op->base.src.scale,
2333				    &v[1], &v[2]);
2334
2335	dst.p.x = box->x1;
2336	v[4] = dst.f;
2337	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2338				    op->base.src.offset[1] + box->y2,
2339				    op->base.src.transform,
2340				    op->base.src.scale,
2341				    &v[5], &v[6]);
2342
2343	dst.p.y = box->y1;
2344	v[8] = dst.f;
2345	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2346				    op->base.src.offset[1] + box->y1,
2347				    op->base.src.transform,
2348				    op->base.src.scale,
2349				    &v[9], &v[10]);
2350
2351	v[11] = v[7] = v[3] = opacity;
2352}
2353
sse2 fastcall static void
emit_span_boxes_affine(const struct sna_composite_spans_op *op,
		       const struct sna_opacity_box *b, int nbox,
		       float *v)
{
	/* Batch variant of emit_span_affine(): emit nbox rectangles of
	 * 12 floats each, transforming every corner through the shared
	 * transform+scale helper.  nbox must be >= 1.
	 */
	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* vertex 0: (x2, y2) */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
					    op->base.src.offset[1] + b->box.y2,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[1], &v[2]);

		/* vertex 1: (x1, y2) */
		dst.p.x = b->box.x1;
		v[4] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
					    op->base.src.offset[1] + b->box.y2,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[5], &v[6]);

		/* vertex 2: (x1, y1) */
		dst.p.y = b->box.y1;
		v[8] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
					    op->base.src.offset[1] + b->box.y1,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[9], &v[10]);

		/* per-box coverage replicated to all three vertices */
		v[11] = v[7] = v[3] = b->alpha;

		v += 12;
		b++;
	} while (--nbox);
}
2396
2397sse2 fastcall static void
2398emit_span_linear(struct sna *sna,
2399		 const struct sna_composite_spans_op *op,
2400		 const BoxRec *box,
2401		 float opacity)
2402{
2403	union {
2404		struct sna_coordinate p;
2405		float f;
2406	} dst;
2407	float *v;
2408
2409	assert(op->base.floats_per_rect == 9);
2410	assert((sna->render.vertex_used % 3) == 0);
2411	v = sna->render.vertices + sna->render.vertex_used;
2412	sna->render.vertex_used += 9;
2413
2414	dst.p.x = box->x2;
2415	dst.p.y = box->y2;
2416	v[0] = dst.f;
2417	dst.p.x = box->x1;
2418	v[3] = dst.f;
2419	dst.p.y = box->y1;
2420	v[6] = dst.f;
2421
2422	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
2423	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
2424	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
2425
2426	v[8] = v[5] = v[2] = opacity;
2427}
2428
2429sse2 fastcall static void
2430emit_span_boxes_linear(const struct sna_composite_spans_op *op,
2431		       const struct sna_opacity_box *b, int nbox,
2432		       float *v)
2433{
2434	do {
2435		union {
2436			struct sna_coordinate p;
2437			float f;
2438		} dst;
2439
2440		dst.p.x = b->box.x2;
2441		dst.p.y = b->box.y2;
2442		v[0] = dst.f;
2443		dst.p.x = b->box.x1;
2444		v[3] = dst.f;
2445		dst.p.y = b->box.y1;
2446		v[6] = dst.f;
2447
2448		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
2449		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
2450		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
2451
2452		v[8] = v[5] = v[2] = b->alpha;
2453
2454		v += 9;
2455		b++;
2456	} while (--nbox);
2457}
2458
2459/* SSE4_2 */
2460#if defined(sse4_2)
2461
sse4_2 fastcall static void
emit_span_identity__sse4_2(struct sna *sna,
			   const struct sna_composite_spans_op *op,
			   const BoxRec *box,
			   float opacity)
{
	/* SSE4.2-targeted copy of emit_span_identity(): one rectangle of
	 * 3 vertices x 4 floats (packed dst coord, src u, src v, opacity)
	 * for an untransformed source.  Keep in lockstep with the sse2
	 * baseline version.
	 */
	float *v;
	union {
		struct sna_coordinate p;
		float f;
	} dst;

	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	assert(op->base.floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*4;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* vertex 0: (x2, y2) */
	dst.p.x = box->x2;
	dst.p.y = box->y2;
	v[0] = dst.f;
	v[1] = (box->x2 + tx) * sx;
	v[6] = v[2] = (box->y2 + ty) * sy;

	/* vertex 1: (x1, y2) shares the v coordinate above */
	dst.p.x = box->x1;
	v[4] = dst.f;
	v[9] = v[5] = (box->x1 + tx) * sx;

	/* vertex 2: (x1, y1) shares the u coordinate above */
	dst.p.y = box->y1;
	v[8] = dst.f;
	v[10] = (box->y1 + ty) * sy;

	v[11] = v[7] = v[3] = opacity;
}
2501
2502sse4_2 fastcall static void
2503emit_span_boxes_identity__sse4_2(const struct sna_composite_spans_op *op,
2504				 const struct sna_opacity_box *b, int nbox,
2505				 float *v)
2506{
2507	do {
2508		union {
2509			struct sna_coordinate p;
2510			float f;
2511		} dst;
2512
2513		float sx = op->base.src.scale[0];
2514		float sy = op->base.src.scale[1];
2515		int16_t tx = op->base.src.offset[0];
2516		int16_t ty = op->base.src.offset[1];
2517
2518		dst.p.x = b->box.x2;
2519		dst.p.y = b->box.y2;
2520		v[0] = dst.f;
2521		v[1] = (b->box.x2 + tx) * sx;
2522		v[6] = v[2] = (b->box.y2 + ty) * sy;
2523
2524		dst.p.x = b->box.x1;
2525		v[4] = dst.f;
2526		v[9] = v[5] = (b->box.x1 + tx) * sx;
2527
2528		dst.p.y = b->box.y1;
2529		v[8] = dst.f;
2530		v[10] = (b->box.y1 + ty) * sy;
2531
2532		v[11] = v[7] = v[3] = b->alpha;
2533
2534		v += 12;
2535		b++;
2536	} while (--nbox);
2537}
2538
sse4_2 fastcall static void
emit_span_simple__sse4_2(struct sna *sna,
			 const struct sna_composite_spans_op *op,
			 const BoxRec *box,
			 float opacity)
{
	/* SSE4.2-targeted copy of emit_span_simple(): one rectangle for
	 * a scale+translate-only source transform.  Keep in lockstep
	 * with the sse2 baseline version.
	 */
	float *v;
	union {
		struct sna_coordinate p;
		float f;
	} dst;

	float xx = op->base.src.transform->matrix[0][0];
	float x0 = op->base.src.transform->matrix[0][2];
	float yy = op->base.src.transform->matrix[1][1];
	float y0 = op->base.src.transform->matrix[1][2];
	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	assert(op->base.floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*4;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* vertex 0: (x2, y2) */
	dst.p.x = box->x2;
	dst.p.y = box->y2;
	v[0] = dst.f;
	v[1] = ((box->x2 + tx) * xx + x0) * sx;
	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;

	/* vertex 1: (x1, y2) shares the v coordinate above */
	dst.p.x = box->x1;
	v[4] = dst.f;
	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;

	/* vertex 2: (x1, y1) shares the u coordinate above */
	dst.p.y = box->y1;
	v[8] = dst.f;
	v[10] = ((box->y1 + ty) * yy + y0) * sy;

	v[11] = v[7] = v[3] = opacity;
}
2582
sse4_2 fastcall static void
emit_span_boxes_simple__sse4_2(const struct sna_composite_spans_op *op,
			       const struct sna_opacity_box *b, int nbox,
			       float *v)
{
	/* SSE4.2-targeted batch variant of emit_span_simple(): emit nbox
	 * rectangles of 12 floats each; per-operation constants are
	 * loaded once before the loop.  nbox must be >= 1.  Keep in
	 * lockstep with the sse2 baseline version.
	 */
	float xx = op->base.src.transform->matrix[0][0];
	float x0 = op->base.src.transform->matrix[0][2];
	float yy = op->base.src.transform->matrix[1][1];
	float y0 = op->base.src.transform->matrix[1][2];
	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* vertex 0: (x2, y2) */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;

		/* vertex 1: (x1, y2) shares the v coordinate above */
		dst.p.x = b->box.x1;
		v[4] = dst.f;
		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;

		/* vertex 2: (x1, y1) shares the u coordinate above */
		dst.p.y = b->box.y1;
		v[8] = dst.f;
		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;

		/* per-box coverage replicated to all three vertices */
		v[11] = v[7] = v[3] = b->alpha;

		v += 12;
		b++;
	} while (--nbox);
}
2623
2624sse4_2 fastcall static void
2625emit_span_affine__sse4_2(struct sna *sna,
2626			 const struct sna_composite_spans_op *op,
2627			 const BoxRec *box,
2628			 float opacity)
2629{
2630	union {
2631		struct sna_coordinate p;
2632		float f;
2633	} dst;
2634	float *v;
2635
2636	assert(op->base.floats_per_rect == 12);
2637	assert((sna->render.vertex_used % 4) == 0);
2638	v = sna->render.vertices + sna->render.vertex_used;
2639	sna->render.vertex_used += 12;
2640
2641	dst.p.x = box->x2;
2642	dst.p.y = box->y2;
2643	v[0] = dst.f;
2644	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
2645				    op->base.src.offset[1] + box->y2,
2646				    op->base.src.transform,
2647				    op->base.src.scale,
2648				    &v[1], &v[2]);
2649
2650	dst.p.x = box->x1;
2651	v[4] = dst.f;
2652	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2653				    op->base.src.offset[1] + box->y2,
2654				    op->base.src.transform,
2655				    op->base.src.scale,
2656				    &v[5], &v[6]);
2657
2658	dst.p.y = box->y1;
2659	v[8] = dst.f;
2660	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2661				    op->base.src.offset[1] + box->y1,
2662				    op->base.src.transform,
2663				    op->base.src.scale,
2664				    &v[9], &v[10]);
2665
2666	v[11] = v[7] = v[3] = opacity;
2667}
2668
sse4_2 fastcall static void
emit_span_boxes_affine__sse4_2(const struct sna_composite_spans_op *op,
			       const struct sna_opacity_box *b, int nbox,
			       float *v)
{
	/* SSE4.2-targeted batch variant of emit_span_affine(): emit nbox
	 * rectangles of 12 floats each.  nbox must be >= 1.  Keep in
	 * lockstep with the sse2 baseline version.
	 */
	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* vertex 0: (x2, y2) */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
					    op->base.src.offset[1] + b->box.y2,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[1], &v[2]);

		/* vertex 1: (x1, y2) */
		dst.p.x = b->box.x1;
		v[4] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
					    op->base.src.offset[1] + b->box.y2,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[5], &v[6]);

		/* vertex 2: (x1, y1) */
		dst.p.y = b->box.y1;
		v[8] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
					    op->base.src.offset[1] + b->box.y1,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[9], &v[10]);

		/* per-box coverage replicated to all three vertices */
		v[11] = v[7] = v[3] = b->alpha;

		v += 12;
		b++;
	} while (--nbox);
}
2711
2712sse4_2 fastcall static void
2713emit_span_linear__sse4_2(struct sna *sna,
2714			 const struct sna_composite_spans_op *op,
2715			 const BoxRec *box,
2716			 float opacity)
2717{
2718	union {
2719		struct sna_coordinate p;
2720		float f;
2721	} dst;
2722	float *v;
2723
2724	assert(op->base.floats_per_rect == 9);
2725	assert((sna->render.vertex_used % 3) == 0);
2726	v = sna->render.vertices + sna->render.vertex_used;
2727	sna->render.vertex_used += 9;
2728
2729	dst.p.x = box->x2;
2730	dst.p.y = box->y2;
2731	v[0] = dst.f;
2732	dst.p.x = box->x1;
2733	v[3] = dst.f;
2734	dst.p.y = box->y1;
2735	v[6] = dst.f;
2736
2737	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
2738	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
2739	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
2740
2741	v[8] = v[5] = v[2] = opacity;
2742}
2743
sse4_2 fastcall static void
emit_span_boxes_linear__sse4_2(const struct sna_composite_spans_op *op,
			       const struct sna_opacity_box *b, int nbox,
			       float *v)
{
	/* SSE4.2-targeted batch variant of emit_span_linear(): emit nbox
	 * rectangles of 9 floats each.  nbox must be >= 1.  Keep in
	 * lockstep with the sse2 baseline version.
	 */
	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* packed destination coordinates of the three corners */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		dst.p.x = b->box.x1;
		v[3] = dst.f;
		dst.p.y = b->box.y1;
		v[6] = dst.f;

		/* gradient coordinate per corner */
		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);

		/* per-box coverage replicated to all three vertices */
		v[8] = v[5] = v[2] = b->alpha;

		v += 9;
		b++;
	} while (--nbox);
}
2773
2774#endif
2775
2776/* AVX2 */
2777#if defined(avx2)
2778
avx2 fastcall static void
emit_span_identity__avx2(struct sna *sna,
			 const struct sna_composite_spans_op *op,
			 const BoxRec *box,
			 float opacity)
{
	/* AVX2-targeted copy of emit_span_identity(): one rectangle of
	 * 3 vertices x 4 floats (packed dst coord, src u, src v, opacity)
	 * for an untransformed source.  Keep in lockstep with the sse2
	 * baseline version.
	 */
	float *v;
	union {
		struct sna_coordinate p;
		float f;
	} dst;

	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	assert(op->base.floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*4;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* vertex 0: (x2, y2) */
	dst.p.x = box->x2;
	dst.p.y = box->y2;
	v[0] = dst.f;
	v[1] = (box->x2 + tx) * sx;
	v[6] = v[2] = (box->y2 + ty) * sy;

	/* vertex 1: (x1, y2) shares the v coordinate above */
	dst.p.x = box->x1;
	v[4] = dst.f;
	v[9] = v[5] = (box->x1 + tx) * sx;

	/* vertex 2: (x1, y1) shares the u coordinate above */
	dst.p.y = box->y1;
	v[8] = dst.f;
	v[10] = (box->y1 + ty) * sy;

	v[11] = v[7] = v[3] = opacity;
}
2818
2819avx2 fastcall static void
2820emit_span_boxes_identity__avx2(const struct sna_composite_spans_op *op,
2821			       const struct sna_opacity_box *b, int nbox,
2822			       float *v)
2823{
2824	do {
2825		union {
2826			struct sna_coordinate p;
2827			float f;
2828		} dst;
2829
2830		float sx = op->base.src.scale[0];
2831		float sy = op->base.src.scale[1];
2832		int16_t tx = op->base.src.offset[0];
2833		int16_t ty = op->base.src.offset[1];
2834
2835		dst.p.x = b->box.x2;
2836		dst.p.y = b->box.y2;
2837		v[0] = dst.f;
2838		v[1] = (b->box.x2 + tx) * sx;
2839		v[6] = v[2] = (b->box.y2 + ty) * sy;
2840
2841		dst.p.x = b->box.x1;
2842		v[4] = dst.f;
2843		v[9] = v[5] = (b->box.x1 + tx) * sx;
2844
2845		dst.p.y = b->box.y1;
2846		v[8] = dst.f;
2847		v[10] = (b->box.y1 + ty) * sy;
2848
2849		v[11] = v[7] = v[3] = b->alpha;
2850
2851		v += 12;
2852		b++;
2853	} while (--nbox);
2854}
2855
avx2 fastcall static void
emit_span_simple__avx2(struct sna *sna,
		       const struct sna_composite_spans_op *op,
		       const BoxRec *box,
		       float opacity)
{
	/* AVX2-targeted copy of emit_span_simple(): one rectangle for a
	 * scale+translate-only source transform.  Keep in lockstep with
	 * the sse2 baseline version.
	 */
	float *v;
	union {
		struct sna_coordinate p;
		float f;
	} dst;

	float xx = op->base.src.transform->matrix[0][0];
	float x0 = op->base.src.transform->matrix[0][2];
	float yy = op->base.src.transform->matrix[1][1];
	float y0 = op->base.src.transform->matrix[1][2];
	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	assert(op->base.floats_per_rect == 12);
	assert((sna->render.vertex_used % 4) == 0);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += 3*4;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	/* vertex 0: (x2, y2) */
	dst.p.x = box->x2;
	dst.p.y = box->y2;
	v[0] = dst.f;
	v[1] = ((box->x2 + tx) * xx + x0) * sx;
	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;

	/* vertex 1: (x1, y2) shares the v coordinate above */
	dst.p.x = box->x1;
	v[4] = dst.f;
	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;

	/* vertex 2: (x1, y1) shares the u coordinate above */
	dst.p.y = box->y1;
	v[8] = dst.f;
	v[10] = ((box->y1 + ty) * yy + y0) * sy;

	v[11] = v[7] = v[3] = opacity;
}
2899
avx2 fastcall static void
emit_span_boxes_simple__avx2(const struct sna_composite_spans_op *op,
			     const struct sna_opacity_box *b, int nbox,
			     float *v)
{
	/* AVX2-targeted batch variant of emit_span_simple(): emit nbox
	 * rectangles of 12 floats each; per-operation constants are
	 * loaded once before the loop.  nbox must be >= 1.  Keep in
	 * lockstep with the sse2 baseline version.
	 */
	float xx = op->base.src.transform->matrix[0][0];
	float x0 = op->base.src.transform->matrix[0][2];
	float yy = op->base.src.transform->matrix[1][1];
	float y0 = op->base.src.transform->matrix[1][2];
	float sx = op->base.src.scale[0];
	float sy = op->base.src.scale[1];
	int16_t tx = op->base.src.offset[0];
	int16_t ty = op->base.src.offset[1];

	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* vertex 0: (x2, y2) */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;

		/* vertex 1: (x1, y2) shares the v coordinate above */
		dst.p.x = b->box.x1;
		v[4] = dst.f;
		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;

		/* vertex 2: (x1, y1) shares the u coordinate above */
		dst.p.y = b->box.y1;
		v[8] = dst.f;
		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;

		/* per-box coverage replicated to all three vertices */
		v[11] = v[7] = v[3] = b->alpha;

		v += 12;
		b++;
	} while (--nbox);
}
2940
2941avx2 fastcall static void
2942emit_span_affine__avx2(struct sna *sna,
2943		       const struct sna_composite_spans_op *op,
2944		       const BoxRec *box,
2945		       float opacity)
2946{
2947	union {
2948		struct sna_coordinate p;
2949		float f;
2950	} dst;
2951	float *v;
2952
2953	assert(op->base.floats_per_rect == 12);
2954	assert((sna->render.vertex_used % 4) == 0);
2955	v = sna->render.vertices + sna->render.vertex_used;
2956	sna->render.vertex_used += 12;
2957
2958	dst.p.x = box->x2;
2959	dst.p.y = box->y2;
2960	v[0] = dst.f;
2961	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
2962				    op->base.src.offset[1] + box->y2,
2963				    op->base.src.transform,
2964				    op->base.src.scale,
2965				    &v[1], &v[2]);
2966
2967	dst.p.x = box->x1;
2968	v[4] = dst.f;
2969	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2970				    op->base.src.offset[1] + box->y2,
2971				    op->base.src.transform,
2972				    op->base.src.scale,
2973				    &v[5], &v[6]);
2974
2975	dst.p.y = box->y1;
2976	v[8] = dst.f;
2977	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2978				    op->base.src.offset[1] + box->y1,
2979				    op->base.src.transform,
2980				    op->base.src.scale,
2981				    &v[9], &v[10]);
2982
2983	v[11] = v[7] = v[3] = opacity;
2984}
2985
avx2 fastcall static void
emit_span_boxes_affine__avx2(const struct sna_composite_spans_op *op,
			     const struct sna_opacity_box *b, int nbox,
			     float *v)
{
	/* AVX2-targeted batch variant of emit_span_affine(): emit nbox
	 * rectangles of 12 floats each.  nbox must be >= 1.  Keep in
	 * lockstep with the sse2 baseline version.
	 */
	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* vertex 0: (x2, y2) */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
					    op->base.src.offset[1] + b->box.y2,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[1], &v[2]);

		/* vertex 1: (x1, y2) */
		dst.p.x = b->box.x1;
		v[4] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
					    op->base.src.offset[1] + b->box.y2,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[5], &v[6]);

		/* vertex 2: (x1, y1) */
		dst.p.y = b->box.y1;
		v[8] = dst.f;
		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
					    op->base.src.offset[1] + b->box.y1,
					    op->base.src.transform,
					    op->base.src.scale,
					    &v[9], &v[10]);

		/* per-box coverage replicated to all three vertices */
		v[11] = v[7] = v[3] = b->alpha;

		v += 12;
		b++;
	} while (--nbox);
}
3028
3029avx2 fastcall static void
3030emit_span_linear__avx2(struct sna *sna,
3031		       const struct sna_composite_spans_op *op,
3032		       const BoxRec *box,
3033		       float opacity)
3034{
3035	union {
3036		struct sna_coordinate p;
3037		float f;
3038	} dst;
3039	float *v;
3040
3041	assert(op->base.floats_per_rect == 9);
3042	assert((sna->render.vertex_used % 3) == 0);
3043	v = sna->render.vertices + sna->render.vertex_used;
3044	sna->render.vertex_used += 9;
3045
3046	dst.p.x = box->x2;
3047	dst.p.y = box->y2;
3048	v[0] = dst.f;
3049	dst.p.x = box->x1;
3050	v[3] = dst.f;
3051	dst.p.y = box->y1;
3052	v[6] = dst.f;
3053
3054	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
3055	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
3056	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
3057
3058	v[8] = v[5] = v[2] = opacity;
3059}
3060
avx2 fastcall static void
emit_span_boxes_linear__avx2(const struct sna_composite_spans_op *op,
			     const struct sna_opacity_box *b, int nbox,
			     float *v)
{
	/* AVX2-targeted batch variant of emit_span_linear(): emit nbox
	 * rectangles of 9 floats each.  nbox must be >= 1.  Keep in
	 * lockstep with the sse2 baseline version.
	 */
	do {
		union {
			struct sna_coordinate p;
			float f;
		} dst;

		/* packed destination coordinates of the three corners */
		dst.p.x = b->box.x2;
		dst.p.y = b->box.y2;
		v[0] = dst.f;
		dst.p.x = b->box.x1;
		v[3] = dst.f;
		dst.p.y = b->box.y1;
		v[6] = dst.f;

		/* gradient coordinate per corner */
		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);

		/* per-box coverage replicated to all three vertices */
		v[8] = v[5] = v[2] = b->alpha;

		v += 9;
		b++;
	} while (--nbox);
}
3090#endif
3091
/* Select the per-rectangle (prim_emit) and per-batch (emit_boxes) span
 * emitters for this composite operation, based on the source type
 * (solid, linear gradient, identity, affine, projective) and the widest
 * SIMD level the CPU offers (AVX2, then SSE4.2, then the sse2 baseline).
 * Returns vb, a small packed code tied to floats_per_vertex -- presumably
 * a vertex-buffer/format selector; confirm its decoding in the caller.
 */
unsigned gen4_choose_spans_emitter(struct sna *sna,
				   struct sna_composite_spans_op *tmp)
{
	unsigned vb;

	if (tmp->base.src.is_solid) {
		DBG(("%s: solid source\n", __FUNCTION__));
		/* 3 floats per vertex: no texture coordinate needed. */
		tmp->prim_emit = emit_span_solid;
		tmp->emit_boxes = emit_span_boxes_solid;
		tmp->base.floats_per_vertex = 3;
		vb = 1 << 2 | 1;
	} else if (tmp->base.src.is_linear) {
		DBG(("%s: linear source\n", __FUNCTION__));
		/* 3 floats per vertex: dst coord, gradient coord, opacity. */
#if defined(avx2)
		if (sna->cpu_features & AVX2) {
			tmp->prim_emit = emit_span_linear__avx2;
			tmp->emit_boxes = emit_span_boxes_linear__avx2;
		} else
#endif
#if defined(sse4_2)
		if (sna->cpu_features & SSE4_2) {
			tmp->prim_emit = emit_span_linear__sse4_2;
			tmp->emit_boxes = emit_span_boxes_linear__sse4_2;
		} else
#endif
		{
			tmp->prim_emit = emit_span_linear;
			tmp->emit_boxes = emit_span_boxes_linear;
		}
		tmp->base.floats_per_vertex = 3;
		vb = 1 << 2 | 1;
	} else if (tmp->base.src.transform == NULL) {
		DBG(("%s: identity transform\n", __FUNCTION__));
		/* 4 floats per vertex: dst coord, src u, src v, opacity. */
#if defined(avx2)
		if (sna->cpu_features & AVX2) {
			tmp->prim_emit = emit_span_identity__avx2;
			tmp->emit_boxes = emit_span_boxes_identity__avx2;
		} else
#endif
#if defined(sse4_2)
		if (sna->cpu_features & SSE4_2) {
			tmp->prim_emit = emit_span_identity__sse4_2;
			tmp->emit_boxes = emit_span_boxes_identity__sse4_2;
		} else
#endif
		{
			tmp->prim_emit = emit_span_identity;
			tmp->emit_boxes = emit_span_boxes_identity;
		}
		tmp->base.floats_per_vertex = 4;
		vb = 1 << 2 | 2;
	} else if (tmp->base.is_affine) {
		/* Fold the constant projective divisor (matrix[2][2]) into
		 * the precomputed scale so the emitters never divide. */
		tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
		tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
		if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) {
			DBG(("%s: simple (unrotated affine) transform\n", __FUNCTION__));
#if defined(avx2)
			if (sna->cpu_features & AVX2) {
				tmp->prim_emit = emit_span_simple__avx2;
				tmp->emit_boxes = emit_span_boxes_simple__avx2;
			} else
#endif
#if defined(sse4_2)
			if (sna->cpu_features & SSE4_2) {
				tmp->prim_emit = emit_span_simple__sse4_2;
				tmp->emit_boxes = emit_span_boxes_simple__sse4_2;
			} else
#endif
			{
				tmp->prim_emit = emit_span_simple;
				tmp->emit_boxes = emit_span_boxes_simple;
			}
		} else {
			DBG(("%s: affine transform\n", __FUNCTION__));
#if defined(avx2)
			if (sna->cpu_features & AVX2) {
				tmp->prim_emit = emit_span_affine__avx2;
				tmp->emit_boxes = emit_span_boxes_affine__avx2;
			} else
#endif
#if defined(sse4_2)
			if (sna->cpu_features & SSE4_2) {
				tmp->prim_emit = emit_span_affine__sse4_2;
				tmp->emit_boxes = emit_span_boxes_affine__sse4_2;
			} else
#endif
			{
				tmp->prim_emit = emit_span_affine;
				tmp->emit_boxes = emit_span_boxes_affine;
			}
		}
		tmp->base.floats_per_vertex = 4;
		vb = 1 << 2 | 2;
	} else {
		DBG(("%s: projective transform\n", __FUNCTION__));
		/* 5 floats per vertex: full projective coordinates. */
		tmp->prim_emit = emit_span_primitive;
		tmp->emit_boxes = emit_span_boxes;
		tmp->base.floats_per_vertex = 5;
		vb = 1 << 2 | 3;
	}
	/* Three vertices per emitted rectangle. */
	tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
	return vb;
}
3195