1428d7b3dSmrg/*
2428d7b3dSmrg * Copyright © 2012 Intel Corporation
3428d7b3dSmrg *
4428d7b3dSmrg * Permission is hereby granted, free of charge, to any person obtaining a
5428d7b3dSmrg * copy of this software and associated documentation files (the "Software"),
6428d7b3dSmrg * to deal in the Software without restriction, including without limitation
7428d7b3dSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8428d7b3dSmrg * and/or sell copies of the Software, and to permit persons to whom the
9428d7b3dSmrg * Software is furnished to do so, subject to the following conditions:
10428d7b3dSmrg *
11428d7b3dSmrg * The above copyright notice and this permission notice (including the next
12428d7b3dSmrg * paragraph) shall be included in all copies or substantial portions of the
13428d7b3dSmrg * Software.
14428d7b3dSmrg *
15428d7b3dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16428d7b3dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17428d7b3dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18428d7b3dSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19428d7b3dSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20428d7b3dSmrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21428d7b3dSmrg * SOFTWARE.
22428d7b3dSmrg *
23428d7b3dSmrg * Authors:
24428d7b3dSmrg *    Chris Wilson <chris@chris-wilson.co.uk>
25428d7b3dSmrg *
26428d7b3dSmrg */
27428d7b3dSmrg
28428d7b3dSmrg#ifdef HAVE_CONFIG_H
29428d7b3dSmrg#include "config.h"
30428d7b3dSmrg#endif
31428d7b3dSmrg
32428d7b3dSmrg#include "sna.h"
33428d7b3dSmrg#include "sna_render.h"
34428d7b3dSmrg#include "sna_render_inline.h"
35428d7b3dSmrg#include "gen4_vertex.h"
36428d7b3dSmrg
/*
 * "sse2" is written in front of the emitter definitions below as a function
 * qualifier.  When nothing included above has defined it, make it expand to
 * nothing so those definitions still compile.
 */
#if !defined(sse2)
#define sse2
#endif
40428d7b3dSmrg
41428d7b3dSmrgvoid gen4_vertex_align(struct sna *sna, const struct sna_composite_op *op)
42428d7b3dSmrg{
43428d7b3dSmrg	int vertex_index;
44428d7b3dSmrg
45428d7b3dSmrg	assert(op->floats_per_vertex);
46428d7b3dSmrg	assert(op->floats_per_rect == 3*op->floats_per_vertex);
47428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
48428d7b3dSmrg
49428d7b3dSmrg	vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
50428d7b3dSmrg	if ((int)sna->render.vertex_size - vertex_index * op->floats_per_vertex < 2*op->floats_per_rect) {
51428d7b3dSmrg		DBG(("%s: flushing vertex buffer: new index=%d, max=%d\n",
52428d7b3dSmrg		     __FUNCTION__, vertex_index, sna->render.vertex_size / op->floats_per_vertex));
53428d7b3dSmrg		if (gen4_vertex_finish(sna) < 2*op->floats_per_rect) {
54428d7b3dSmrg			kgem_submit(&sna->kgem);
55428d7b3dSmrg			_kgem_set_mode(&sna->kgem, KGEM_RENDER);
56428d7b3dSmrg		}
57428d7b3dSmrg		assert(sna->render.vertex_used < sna->render.vertex_size);
58428d7b3dSmrg
59428d7b3dSmrg		vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
60428d7b3dSmrg		assert(vertex_index * op->floats_per_vertex <= sna->render.vertex_size);
61428d7b3dSmrg	}
62428d7b3dSmrg
63428d7b3dSmrg	sna->render.vertex_index = vertex_index;
64428d7b3dSmrg	sna->render.vertex_used = vertex_index * op->floats_per_vertex;
65428d7b3dSmrg}
66428d7b3dSmrg
67428d7b3dSmrgvoid gen4_vertex_flush(struct sna *sna)
68428d7b3dSmrg{
69428d7b3dSmrg	DBG(("%s[%x] = %d\n", __FUNCTION__,
70428d7b3dSmrg	     4*sna->render.vertex_offset,
71428d7b3dSmrg	     sna->render.vertex_index - sna->render.vertex_start));
72428d7b3dSmrg
73428d7b3dSmrg	assert(sna->render.vertex_offset);
74428d7b3dSmrg	assert(sna->render.vertex_offset <= sna->kgem.nbatch);
75428d7b3dSmrg	assert(sna->render.vertex_index > sna->render.vertex_start);
76428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
77428d7b3dSmrg
78428d7b3dSmrg	sna->kgem.batch[sna->render.vertex_offset] =
79428d7b3dSmrg		sna->render.vertex_index - sna->render.vertex_start;
80428d7b3dSmrg	sna->render.vertex_offset = 0;
81428d7b3dSmrg}
82428d7b3dSmrg
83428d7b3dSmrgint gen4_vertex_finish(struct sna *sna)
84428d7b3dSmrg{
85428d7b3dSmrg	struct kgem_bo *bo;
86428d7b3dSmrg	unsigned int i;
87428d7b3dSmrg	unsigned hint, size;
88428d7b3dSmrg
89428d7b3dSmrg	DBG(("%s: used=%d / %d\n", __FUNCTION__,
90428d7b3dSmrg	     sna->render.vertex_used, sna->render.vertex_size));
91428d7b3dSmrg	assert(sna->render.vertex_offset == 0);
92428d7b3dSmrg	assert(sna->render.vertex_used);
93428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
94428d7b3dSmrg
95428d7b3dSmrg	sna_vertex_wait__locked(&sna->render);
96428d7b3dSmrg
97428d7b3dSmrg	/* Note: we only need dword alignment (currently) */
98428d7b3dSmrg
99428d7b3dSmrg	hint = CREATE_GTT_MAP;
100428d7b3dSmrg
101428d7b3dSmrg	bo = sna->render.vbo;
102428d7b3dSmrg	if (bo) {
103428d7b3dSmrg		for (i = 0; i < sna->render.nvertex_reloc; i++) {
104428d7b3dSmrg			DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
105428d7b3dSmrg			     i, sna->render.vertex_reloc[i]));
106428d7b3dSmrg
107428d7b3dSmrg			sna->kgem.batch[sna->render.vertex_reloc[i]] =
108428d7b3dSmrg				kgem_add_reloc(&sna->kgem,
109428d7b3dSmrg					       sna->render.vertex_reloc[i], bo,
110428d7b3dSmrg					       I915_GEM_DOMAIN_VERTEX << 16,
111428d7b3dSmrg					       0);
112428d7b3dSmrg		}
113428d7b3dSmrg
114428d7b3dSmrg		assert(!sna->render.active);
115428d7b3dSmrg		sna->render.nvertex_reloc = 0;
116428d7b3dSmrg		sna->render.vertex_used = 0;
117428d7b3dSmrg		sna->render.vertex_index = 0;
118428d7b3dSmrg		sna->render.vbo = NULL;
119428d7b3dSmrg		sna->render.vb_id = 0;
120428d7b3dSmrg
121428d7b3dSmrg		kgem_bo_destroy(&sna->kgem, bo);
122428d7b3dSmrg		hint |= CREATE_CACHED | CREATE_NO_THROTTLE;
123428d7b3dSmrg	} else {
124428d7b3dSmrg		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
125428d7b3dSmrg		assert(sna->render.vertices == sna->render.vertex_data);
126428d7b3dSmrg		if (kgem_is_idle(&sna->kgem))
127428d7b3dSmrg			return 0;
128428d7b3dSmrg	}
129428d7b3dSmrg
130428d7b3dSmrg	size = 256*1024;
131428d7b3dSmrg	assert(!sna->render.active);
132428d7b3dSmrg	sna->render.vertices = NULL;
133428d7b3dSmrg	sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
134428d7b3dSmrg	while (sna->render.vbo == NULL && size > sizeof(sna->render.vertex_data)) {
135428d7b3dSmrg		size /= 2;
136428d7b3dSmrg		sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
137428d7b3dSmrg	}
138428d7b3dSmrg	if (sna->render.vbo == NULL)
139428d7b3dSmrg		sna->render.vbo = kgem_create_linear(&sna->kgem,
140428d7b3dSmrg						     256*1024, CREATE_GTT_MAP);
141428d7b3dSmrg	if (sna->render.vbo &&
142428d7b3dSmrg	    kgem_check_bo(&sna->kgem, sna->render.vbo, NULL))
143428d7b3dSmrg		sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
144428d7b3dSmrg	if (sna->render.vertices == NULL) {
145428d7b3dSmrg		if (sna->render.vbo) {
146428d7b3dSmrg			kgem_bo_destroy(&sna->kgem, sna->render.vbo);
147428d7b3dSmrg			sna->render.vbo = NULL;
148428d7b3dSmrg		}
149428d7b3dSmrg		sna->render.vertices = sna->render.vertex_data;
150428d7b3dSmrg		sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
151428d7b3dSmrg		return 0;
152428d7b3dSmrg	}
153428d7b3dSmrg
154428d7b3dSmrg	if (sna->render.vertex_used) {
155428d7b3dSmrg		DBG(("%s: copying initial buffer x %d to handle=%d\n",
156428d7b3dSmrg		     __FUNCTION__,
157428d7b3dSmrg		     sna->render.vertex_used,
158428d7b3dSmrg		     sna->render.vbo->handle));
159428d7b3dSmrg		assert(sizeof(float)*sna->render.vertex_used <=
160428d7b3dSmrg		       __kgem_bo_size(sna->render.vbo));
161428d7b3dSmrg		memcpy(sna->render.vertices,
162428d7b3dSmrg		       sna->render.vertex_data,
163428d7b3dSmrg		       sizeof(float)*sna->render.vertex_used);
164428d7b3dSmrg	}
165428d7b3dSmrg
166428d7b3dSmrg	size = __kgem_bo_size(sna->render.vbo)/4;
167428d7b3dSmrg	if (size >= UINT16_MAX)
168428d7b3dSmrg		size = UINT16_MAX - 1;
169428d7b3dSmrg
170428d7b3dSmrg	DBG(("%s: create vbo handle=%d, size=%d floats [%d bytes]\n",
171428d7b3dSmrg	     __FUNCTION__, sna->render.vbo->handle, size, __kgem_bo_size(sna->render.vbo)));
172428d7b3dSmrg	assert(size > sna->render.vertex_used);
173428d7b3dSmrg
174428d7b3dSmrg	sna->render.vertex_size = size;
175428d7b3dSmrg	return size - sna->render.vertex_used;
176428d7b3dSmrg}
177428d7b3dSmrg
178428d7b3dSmrgvoid gen4_vertex_close(struct sna *sna)
179428d7b3dSmrg{
180428d7b3dSmrg	struct kgem_bo *bo, *free_bo = NULL;
181428d7b3dSmrg	unsigned int i, delta = 0;
182428d7b3dSmrg
183428d7b3dSmrg	assert(sna->render.vertex_offset == 0);
184428d7b3dSmrg	if (!sna->render.vb_id)
185428d7b3dSmrg		return;
186428d7b3dSmrg
187428d7b3dSmrg	DBG(("%s: used=%d, vbo active? %d, vb=%x, nreloc=%d\n",
188428d7b3dSmrg	     __FUNCTION__, sna->render.vertex_used, sna->render.vbo ? sna->render.vbo->handle : 0,
189428d7b3dSmrg	     sna->render.vb_id, sna->render.nvertex_reloc));
190428d7b3dSmrg
191428d7b3dSmrg	assert(!sna->render.active);
192428d7b3dSmrg
193428d7b3dSmrg	bo = sna->render.vbo;
194428d7b3dSmrg	if (bo) {
195428d7b3dSmrg		if (sna->render.vertex_size - sna->render.vertex_used < 64) {
196428d7b3dSmrg			DBG(("%s: discarding vbo (full), handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
197428d7b3dSmrg			sna->render.vbo = NULL;
198428d7b3dSmrg			sna->render.vertices = sna->render.vertex_data;
199428d7b3dSmrg			sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
200428d7b3dSmrg			free_bo = bo;
201428d7b3dSmrg		} else if (!sna->kgem.has_llc && sna->render.vertices == MAP(bo->map__cpu)) {
202428d7b3dSmrg			DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
203428d7b3dSmrg			sna->render.vertices =
204428d7b3dSmrg				kgem_bo_map__gtt(&sna->kgem, sna->render.vbo);
205428d7b3dSmrg			if (sna->render.vertices == NULL) {
206428d7b3dSmrg				sna->render.vbo = NULL;
207428d7b3dSmrg				sna->render.vertices = sna->render.vertex_data;
208428d7b3dSmrg				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
209428d7b3dSmrg				free_bo = bo;
210428d7b3dSmrg			}
211428d7b3dSmrg
212428d7b3dSmrg		}
213428d7b3dSmrg	} else {
214428d7b3dSmrg		int size;
215428d7b3dSmrg
216428d7b3dSmrg		size  = sna->kgem.nbatch;
217428d7b3dSmrg		size += sna->kgem.batch_size - sna->kgem.surface;
218428d7b3dSmrg		size += sna->render.vertex_used;
219428d7b3dSmrg
220428d7b3dSmrg		if (size <= 1024) {
221428d7b3dSmrg			DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
222428d7b3dSmrg			     sna->render.vertex_used, sna->kgem.nbatch));
223428d7b3dSmrg			assert(sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface);
224428d7b3dSmrg			memcpy(sna->kgem.batch + sna->kgem.nbatch,
225428d7b3dSmrg			       sna->render.vertex_data,
226428d7b3dSmrg			       sna->render.vertex_used * 4);
227428d7b3dSmrg			delta = sna->kgem.nbatch * 4;
228428d7b3dSmrg			bo = NULL;
229428d7b3dSmrg			sna->kgem.nbatch += sna->render.vertex_used;
230428d7b3dSmrg		} else {
231428d7b3dSmrg			size = 256 * 1024;
232428d7b3dSmrg			do {
233428d7b3dSmrg				bo = kgem_create_linear(&sna->kgem, size,
234428d7b3dSmrg							CREATE_GTT_MAP | CREATE_NO_RETIRE | CREATE_NO_THROTTLE | CREATE_CACHED);
235428d7b3dSmrg			} while (bo == NULL && (size>>=1) > sizeof(float)*sna->render.vertex_used);
236428d7b3dSmrg
237428d7b3dSmrg			sna->render.vertices = NULL;
238428d7b3dSmrg			if (bo)
239428d7b3dSmrg				sna->render.vertices = kgem_bo_map(&sna->kgem, bo);
240428d7b3dSmrg			if (sna->render.vertices != NULL) {
241428d7b3dSmrg				DBG(("%s: new vbo: %d / %d\n", __FUNCTION__,
242428d7b3dSmrg				     sna->render.vertex_used, __kgem_bo_size(bo)/4));
243428d7b3dSmrg
244428d7b3dSmrg				assert(sizeof(float)*sna->render.vertex_used <= __kgem_bo_size(bo));
245428d7b3dSmrg				memcpy(sna->render.vertices,
246428d7b3dSmrg				       sna->render.vertex_data,
247428d7b3dSmrg				       sizeof(float)*sna->render.vertex_used);
248428d7b3dSmrg
249428d7b3dSmrg				size = __kgem_bo_size(bo)/4;
250428d7b3dSmrg				if (size >= UINT16_MAX)
251428d7b3dSmrg					size = UINT16_MAX - 1;
252428d7b3dSmrg
253428d7b3dSmrg				sna->render.vbo = bo;
254428d7b3dSmrg				sna->render.vertex_size = size;
255428d7b3dSmrg			} else {
256428d7b3dSmrg				DBG(("%s: tmp vbo: %d\n", __FUNCTION__,
257428d7b3dSmrg				     sna->render.vertex_used));
258428d7b3dSmrg
259428d7b3dSmrg				if (bo)
260428d7b3dSmrg					kgem_bo_destroy(&sna->kgem, bo);
261428d7b3dSmrg
262428d7b3dSmrg				bo = kgem_create_linear(&sna->kgem,
263428d7b3dSmrg							4*sna->render.vertex_used,
264428d7b3dSmrg							CREATE_NO_THROTTLE);
265428d7b3dSmrg				if (bo && !kgem_bo_write(&sna->kgem, bo,
266428d7b3dSmrg							 sna->render.vertex_data,
267428d7b3dSmrg							 4*sna->render.vertex_used)) {
268428d7b3dSmrg					kgem_bo_destroy(&sna->kgem, bo);
269428d7b3dSmrg					bo = NULL;
270428d7b3dSmrg				}
271428d7b3dSmrg
272428d7b3dSmrg				assert(sna->render.vbo == NULL);
273428d7b3dSmrg				sna->render.vertices = sna->render.vertex_data;
274428d7b3dSmrg				sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
275428d7b3dSmrg				free_bo = bo;
276428d7b3dSmrg			}
277428d7b3dSmrg		}
278428d7b3dSmrg	}
279428d7b3dSmrg
280428d7b3dSmrg	assert(sna->render.nvertex_reloc);
281428d7b3dSmrg	for (i = 0; i < sna->render.nvertex_reloc; i++) {
282428d7b3dSmrg		DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
283428d7b3dSmrg		     i, sna->render.vertex_reloc[i]));
284428d7b3dSmrg
285428d7b3dSmrg		sna->kgem.batch[sna->render.vertex_reloc[i]] =
286428d7b3dSmrg			kgem_add_reloc(&sna->kgem,
287428d7b3dSmrg				       sna->render.vertex_reloc[i], bo,
288428d7b3dSmrg				       I915_GEM_DOMAIN_VERTEX << 16,
289428d7b3dSmrg				       delta);
290428d7b3dSmrg	}
291428d7b3dSmrg	sna->render.nvertex_reloc = 0;
292428d7b3dSmrg	sna->render.vb_id = 0;
293428d7b3dSmrg
294428d7b3dSmrg	if (sna->render.vbo == NULL) {
295428d7b3dSmrg		assert(!sna->render.active);
296428d7b3dSmrg		sna->render.vertex_used = 0;
297428d7b3dSmrg		sna->render.vertex_index = 0;
298428d7b3dSmrg		assert(sna->render.vertices == sna->render.vertex_data);
299428d7b3dSmrg		assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
300428d7b3dSmrg	}
301428d7b3dSmrg
302428d7b3dSmrg	if (free_bo)
303428d7b3dSmrg		kgem_bo_destroy(&sna->kgem, free_bo);
304428d7b3dSmrg}
305428d7b3dSmrg
306428d7b3dSmrg/* specialised vertex emission routines */
307428d7b3dSmrg
/*
 * Shorthands that forward to the vertex_emit helpers.  Both rely on a
 * variable named "sna" being in scope at the point of use.
 */
#define OUT_VERTEX(x, y) vertex_emit_2s(sna, (x), (y)) /* XXX assert(!too_large(x, y)); */
#define OUT_VERTEX_F(v) vertex_emit(sna, (v))
310428d7b3dSmrg
311428d7b3dSmrgforce_inline static float
312428d7b3dSmrgcompute_linear(const struct sna_composite_channel *channel,
313428d7b3dSmrg	       int16_t x, int16_t y)
314428d7b3dSmrg{
315428d7b3dSmrg	return ((x+channel->offset[0]) * channel->u.linear.dx +
316428d7b3dSmrg		(y+channel->offset[1]) * channel->u.linear.dy +
317428d7b3dSmrg		channel->u.linear.offset);
318428d7b3dSmrg}
319428d7b3dSmrg
320428d7b3dSmrgsse2 inline static void
321428d7b3dSmrgemit_texcoord(struct sna *sna,
322428d7b3dSmrg	      const struct sna_composite_channel *channel,
323428d7b3dSmrg	      int16_t x, int16_t y)
324428d7b3dSmrg{
325428d7b3dSmrg	if (channel->is_solid) {
326428d7b3dSmrg		OUT_VERTEX_F(0.5);
327428d7b3dSmrg		return;
328428d7b3dSmrg	}
329428d7b3dSmrg
330428d7b3dSmrg	x += channel->offset[0];
331428d7b3dSmrg	y += channel->offset[1];
332428d7b3dSmrg
333428d7b3dSmrg	if (channel->is_affine) {
334428d7b3dSmrg		float s, t;
335428d7b3dSmrg
336428d7b3dSmrg		sna_get_transformed_coordinates(x, y,
337428d7b3dSmrg						channel->transform,
338428d7b3dSmrg						&s, &t);
339428d7b3dSmrg		OUT_VERTEX_F(s * channel->scale[0]);
340428d7b3dSmrg		OUT_VERTEX_F(t * channel->scale[1]);
341428d7b3dSmrg	} else {
342428d7b3dSmrg		float s, t, w;
343428d7b3dSmrg
344428d7b3dSmrg		sna_get_transformed_coordinates_3d(x, y,
345428d7b3dSmrg						   channel->transform,
346428d7b3dSmrg						   &s, &t, &w);
347428d7b3dSmrg		OUT_VERTEX_F(s * channel->scale[0]);
348428d7b3dSmrg		OUT_VERTEX_F(t * channel->scale[1]);
349428d7b3dSmrg		OUT_VERTEX_F(w);
350428d7b3dSmrg	}
351428d7b3dSmrg}
352428d7b3dSmrg
353428d7b3dSmrgsse2 force_inline static void
354428d7b3dSmrgemit_vertex(struct sna *sna,
355428d7b3dSmrg	    const struct sna_composite_op *op,
356428d7b3dSmrg	    int16_t srcX, int16_t srcY,
357428d7b3dSmrg	    int16_t mskX, int16_t mskY,
358428d7b3dSmrg	    int16_t dstX, int16_t dstY)
359428d7b3dSmrg{
360428d7b3dSmrg	OUT_VERTEX(dstX, dstY);
361428d7b3dSmrg	emit_texcoord(sna, &op->src, srcX, srcY);
362428d7b3dSmrg}
363428d7b3dSmrg
364428d7b3dSmrgsse2 fastcall static void
365428d7b3dSmrgemit_primitive(struct sna *sna,
366428d7b3dSmrg	       const struct sna_composite_op *op,
367428d7b3dSmrg	       const struct sna_composite_rectangles *r)
368428d7b3dSmrg{
369428d7b3dSmrg	emit_vertex(sna, op,
370428d7b3dSmrg		    r->src.x + r->width,  r->src.y + r->height,
371428d7b3dSmrg		    r->mask.x + r->width, r->mask.y + r->height,
372428d7b3dSmrg		    r->dst.x + r->width, r->dst.y + r->height);
373428d7b3dSmrg	emit_vertex(sna, op,
374428d7b3dSmrg		    r->src.x,  r->src.y + r->height,
375428d7b3dSmrg		    r->mask.x, r->mask.y + r->height,
376428d7b3dSmrg		    r->dst.x,  r->dst.y + r->height);
377428d7b3dSmrg	emit_vertex(sna, op,
378428d7b3dSmrg		    r->src.x,  r->src.y,
379428d7b3dSmrg		    r->mask.x, r->mask.y,
380428d7b3dSmrg		    r->dst.x,  r->dst.y);
381428d7b3dSmrg}
382428d7b3dSmrg
383428d7b3dSmrgsse2 inline static float *
384428d7b3dSmrgvemit_texcoord(float *v,
385428d7b3dSmrg	      const struct sna_composite_channel *channel,
386428d7b3dSmrg	      int16_t x, int16_t y)
387428d7b3dSmrg{
388428d7b3dSmrg	if (channel->is_solid) {
389428d7b3dSmrg		*v++ = 0.5;
390428d7b3dSmrg	} else {
391428d7b3dSmrg		x += channel->offset[0];
392428d7b3dSmrg		y += channel->offset[1];
393428d7b3dSmrg
394428d7b3dSmrg		if (channel->is_affine) {
395428d7b3dSmrg			float s, t;
396428d7b3dSmrg
397428d7b3dSmrg			sna_get_transformed_coordinates(x, y,
398428d7b3dSmrg							channel->transform,
399428d7b3dSmrg							&s, &t);
400428d7b3dSmrg			*v++ = s * channel->scale[0];
401428d7b3dSmrg			*v++ = t * channel->scale[1];
402428d7b3dSmrg		} else {
403428d7b3dSmrg			float s, t, w;
404428d7b3dSmrg
405428d7b3dSmrg			sna_get_transformed_coordinates_3d(x, y,
406428d7b3dSmrg							   channel->transform,
407428d7b3dSmrg							   &s, &t, &w);
408428d7b3dSmrg			*v++ = s * channel->scale[0];
409428d7b3dSmrg			*v++ = t * channel->scale[1];
410428d7b3dSmrg			*v++ = w;
411428d7b3dSmrg		}
412428d7b3dSmrg	}
413428d7b3dSmrg
414428d7b3dSmrg	return v;
415428d7b3dSmrg}
416428d7b3dSmrg
417428d7b3dSmrgsse2 force_inline static float *
418428d7b3dSmrgvemit_vertex(float *v,
419428d7b3dSmrg	     const struct sna_composite_op *op,
420428d7b3dSmrg	     int16_t x, int16_t y)
421428d7b3dSmrg{
422428d7b3dSmrg	*v++ = pack_2s(x, y);
423428d7b3dSmrg	return vemit_texcoord(v, &op->src, x, y);
424428d7b3dSmrg}
425428d7b3dSmrg
426428d7b3dSmrgsse2 fastcall static void
427428d7b3dSmrgemit_boxes(const struct sna_composite_op *op,
428428d7b3dSmrg	   const BoxRec *box, int nbox,
429428d7b3dSmrg	   float *v)
430428d7b3dSmrg{
431428d7b3dSmrg	do {
432428d7b3dSmrg		v = vemit_vertex(v, op, box->x2, box->y2);
433428d7b3dSmrg		v = vemit_vertex(v, op, box->x1, box->y2);
434428d7b3dSmrg		v = vemit_vertex(v, op, box->x1, box->y1);
435428d7b3dSmrg
436428d7b3dSmrg		box++;
437428d7b3dSmrg	} while (--nbox);
438428d7b3dSmrg}
439428d7b3dSmrg
440428d7b3dSmrgsse2 force_inline static void
441428d7b3dSmrgemit_vertex_mask(struct sna *sna,
442428d7b3dSmrg		 const struct sna_composite_op *op,
443428d7b3dSmrg		 int16_t srcX, int16_t srcY,
444428d7b3dSmrg		 int16_t mskX, int16_t mskY,
445428d7b3dSmrg		 int16_t dstX, int16_t dstY)
446428d7b3dSmrg{
447428d7b3dSmrg	OUT_VERTEX(dstX, dstY);
448428d7b3dSmrg	emit_texcoord(sna, &op->src, srcX, srcY);
449428d7b3dSmrg	emit_texcoord(sna, &op->mask, mskX, mskY);
450428d7b3dSmrg}
451428d7b3dSmrg
452428d7b3dSmrgsse2 fastcall static void
453428d7b3dSmrgemit_primitive_mask(struct sna *sna,
454428d7b3dSmrg		    const struct sna_composite_op *op,
455428d7b3dSmrg		    const struct sna_composite_rectangles *r)
456428d7b3dSmrg{
457428d7b3dSmrg	emit_vertex_mask(sna, op,
458428d7b3dSmrg			 r->src.x + r->width,  r->src.y + r->height,
459428d7b3dSmrg			 r->mask.x + r->width, r->mask.y + r->height,
460428d7b3dSmrg			 r->dst.x + r->width, r->dst.y + r->height);
461428d7b3dSmrg	emit_vertex_mask(sna, op,
462428d7b3dSmrg			 r->src.x,  r->src.y + r->height,
463428d7b3dSmrg			 r->mask.x, r->mask.y + r->height,
464428d7b3dSmrg			 r->dst.x,  r->dst.y + r->height);
465428d7b3dSmrg	emit_vertex_mask(sna, op,
466428d7b3dSmrg			 r->src.x,  r->src.y,
467428d7b3dSmrg			 r->mask.x, r->mask.y,
468428d7b3dSmrg			 r->dst.x,  r->dst.y);
469428d7b3dSmrg}
470428d7b3dSmrg
471428d7b3dSmrgsse2 force_inline static float *
472428d7b3dSmrgvemit_vertex_mask(float *v,
473428d7b3dSmrg		  const struct sna_composite_op *op,
474428d7b3dSmrg		  int16_t x, int16_t y)
475428d7b3dSmrg{
476428d7b3dSmrg	*v++ = pack_2s(x, y);
477428d7b3dSmrg	v = vemit_texcoord(v, &op->src, x, y);
478428d7b3dSmrg	v = vemit_texcoord(v, &op->mask, x, y);
479428d7b3dSmrg	return v;
480428d7b3dSmrg}
481428d7b3dSmrg
482428d7b3dSmrgsse2 fastcall static void
483428d7b3dSmrgemit_boxes_mask(const struct sna_composite_op *op,
484428d7b3dSmrg		const BoxRec *box, int nbox,
485428d7b3dSmrg		float *v)
486428d7b3dSmrg{
487428d7b3dSmrg	do {
488428d7b3dSmrg		v = vemit_vertex_mask(v, op, box->x2, box->y2);
489428d7b3dSmrg		v = vemit_vertex_mask(v, op, box->x1, box->y2);
490428d7b3dSmrg		v = vemit_vertex_mask(v, op, box->x1, box->y1);
491428d7b3dSmrg
492428d7b3dSmrg		box++;
493428d7b3dSmrg	} while (--nbox);
494428d7b3dSmrg}
495428d7b3dSmrg
496428d7b3dSmrg
497428d7b3dSmrgsse2 fastcall static void
498428d7b3dSmrgemit_primitive_solid(struct sna *sna,
499428d7b3dSmrg		     const struct sna_composite_op *op,
500428d7b3dSmrg		     const struct sna_composite_rectangles *r)
501428d7b3dSmrg{
502428d7b3dSmrg	float *v;
503428d7b3dSmrg	union {
504428d7b3dSmrg		struct sna_coordinate p;
505428d7b3dSmrg		float f;
506428d7b3dSmrg	} dst;
507428d7b3dSmrg
508428d7b3dSmrg	assert(op->floats_per_rect == 6);
509428d7b3dSmrg	assert((sna->render.vertex_used % 2) == 0);
510428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
511428d7b3dSmrg	sna->render.vertex_used += 6;
512428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
513428d7b3dSmrg
514428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
515428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
516428d7b3dSmrg	v[0] = dst.f;
517428d7b3dSmrg	dst.p.x = r->dst.x;
518428d7b3dSmrg	v[2] = dst.f;
519428d7b3dSmrg	dst.p.y = r->dst.y;
520428d7b3dSmrg	v[4] = dst.f;
521428d7b3dSmrg
522428d7b3dSmrg	v[5] = v[3] = v[1] = .5;
523428d7b3dSmrg}
524428d7b3dSmrg
525428d7b3dSmrgsse2 fastcall static void
526428d7b3dSmrgemit_boxes_solid(const struct sna_composite_op *op,
527428d7b3dSmrg		 const BoxRec *box, int nbox,
528428d7b3dSmrg		 float *v)
529428d7b3dSmrg{
530428d7b3dSmrg	do {
531428d7b3dSmrg		union {
532428d7b3dSmrg			struct sna_coordinate p;
533428d7b3dSmrg			float f;
534428d7b3dSmrg		} dst;
535428d7b3dSmrg
536428d7b3dSmrg		dst.p.x = box->x2;
537428d7b3dSmrg		dst.p.y = box->y2;
538428d7b3dSmrg		v[0] = dst.f;
539428d7b3dSmrg		dst.p.x = box->x1;
540428d7b3dSmrg		v[2] = dst.f;
541428d7b3dSmrg		dst.p.y = box->y1;
542428d7b3dSmrg		v[4] = dst.f;
543428d7b3dSmrg
544428d7b3dSmrg		v[5] = v[3] = v[1] = .5;
545428d7b3dSmrg		box++;
546428d7b3dSmrg		v += 6;
547428d7b3dSmrg	} while (--nbox);
548428d7b3dSmrg}
549428d7b3dSmrg
550428d7b3dSmrgsse2 fastcall static void
551428d7b3dSmrgemit_primitive_linear(struct sna *sna,
552428d7b3dSmrg		      const struct sna_composite_op *op,
553428d7b3dSmrg		      const struct sna_composite_rectangles *r)
554428d7b3dSmrg{
555428d7b3dSmrg	float *v;
556428d7b3dSmrg	union {
557428d7b3dSmrg		struct sna_coordinate p;
558428d7b3dSmrg		float f;
559428d7b3dSmrg	} dst;
560428d7b3dSmrg
561428d7b3dSmrg	assert(op->floats_per_rect == 6);
562428d7b3dSmrg	assert((sna->render.vertex_used % 2) == 0);
563428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
564428d7b3dSmrg	sna->render.vertex_used += 6;
565428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
566428d7b3dSmrg
567428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
568428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
569428d7b3dSmrg	v[0] = dst.f;
570428d7b3dSmrg	dst.p.x = r->dst.x;
571428d7b3dSmrg	v[2] = dst.f;
572428d7b3dSmrg	dst.p.y = r->dst.y;
573428d7b3dSmrg	v[4] = dst.f;
574428d7b3dSmrg
575428d7b3dSmrg	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
576428d7b3dSmrg	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
577428d7b3dSmrg	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
578428d7b3dSmrg}
579428d7b3dSmrg
580428d7b3dSmrgsse2 fastcall static void
581428d7b3dSmrgemit_boxes_linear(const struct sna_composite_op *op,
582428d7b3dSmrg		  const BoxRec *box, int nbox,
583428d7b3dSmrg		  float *v)
584428d7b3dSmrg{
585428d7b3dSmrg	union {
586428d7b3dSmrg		struct sna_coordinate p;
587428d7b3dSmrg		float f;
588428d7b3dSmrg	} dst;
589428d7b3dSmrg
590428d7b3dSmrg	do {
591428d7b3dSmrg		dst.p.x = box->x2;
592428d7b3dSmrg		dst.p.y = box->y2;
593428d7b3dSmrg		v[0] = dst.f;
594428d7b3dSmrg		dst.p.x = box->x1;
595428d7b3dSmrg		v[2] = dst.f;
596428d7b3dSmrg		dst.p.y = box->y1;
597428d7b3dSmrg		v[4] = dst.f;
598428d7b3dSmrg
599428d7b3dSmrg		v[1] = compute_linear(&op->src, box->x2, box->y2);
600428d7b3dSmrg		v[3] = compute_linear(&op->src, box->x1, box->y2);
601428d7b3dSmrg		v[5] = compute_linear(&op->src, box->x1, box->y1);
602428d7b3dSmrg
603428d7b3dSmrg		v += 6;
604428d7b3dSmrg		box++;
605428d7b3dSmrg	} while (--nbox);
606428d7b3dSmrg}
607428d7b3dSmrg
608428d7b3dSmrgsse2 fastcall static void
609428d7b3dSmrgemit_primitive_identity_source(struct sna *sna,
610428d7b3dSmrg			       const struct sna_composite_op *op,
611428d7b3dSmrg			       const struct sna_composite_rectangles *r)
612428d7b3dSmrg{
613428d7b3dSmrg	union {
614428d7b3dSmrg		struct sna_coordinate p;
615428d7b3dSmrg		float f;
616428d7b3dSmrg	} dst;
617428d7b3dSmrg	float *v;
618428d7b3dSmrg
619428d7b3dSmrg	assert(op->floats_per_rect == 9);
620428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
621428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
622428d7b3dSmrg	sna->render.vertex_used += 9;
623428d7b3dSmrg
624428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
625428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
626428d7b3dSmrg	v[0] = dst.f;
627428d7b3dSmrg	dst.p.x = r->dst.x;
628428d7b3dSmrg	v[3] = dst.f;
629428d7b3dSmrg	dst.p.y = r->dst.y;
630428d7b3dSmrg	v[6] = dst.f;
631428d7b3dSmrg
632428d7b3dSmrg	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
633428d7b3dSmrg	v[1] = v[4] + r->width * op->src.scale[0];
634428d7b3dSmrg
635428d7b3dSmrg	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
636428d7b3dSmrg	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
637428d7b3dSmrg}
638428d7b3dSmrg
639428d7b3dSmrgsse2 fastcall static void
640428d7b3dSmrgemit_boxes_identity_source(const struct sna_composite_op *op,
641428d7b3dSmrg			   const BoxRec *box, int nbox,
642428d7b3dSmrg			   float *v)
643428d7b3dSmrg{
644428d7b3dSmrg	do {
645428d7b3dSmrg		union {
646428d7b3dSmrg			struct sna_coordinate p;
647428d7b3dSmrg			float f;
648428d7b3dSmrg		} dst;
649428d7b3dSmrg
650428d7b3dSmrg		dst.p.x = box->x2;
651428d7b3dSmrg		dst.p.y = box->y2;
652428d7b3dSmrg		v[0] = dst.f;
653428d7b3dSmrg		dst.p.x = box->x1;
654428d7b3dSmrg		v[3] = dst.f;
655428d7b3dSmrg		dst.p.y = box->y1;
656428d7b3dSmrg		v[6] = dst.f;
657428d7b3dSmrg
658428d7b3dSmrg		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
659428d7b3dSmrg		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
660428d7b3dSmrg
661428d7b3dSmrg		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
662428d7b3dSmrg		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
663428d7b3dSmrg
664428d7b3dSmrg		v += 9;
665428d7b3dSmrg		box++;
666428d7b3dSmrg	} while (--nbox);
667428d7b3dSmrg}
668428d7b3dSmrg
669428d7b3dSmrgsse2 fastcall static void
670428d7b3dSmrgemit_primitive_simple_source(struct sna *sna,
671428d7b3dSmrg			     const struct sna_composite_op *op,
672428d7b3dSmrg			     const struct sna_composite_rectangles *r)
673428d7b3dSmrg{
674428d7b3dSmrg	float *v;
675428d7b3dSmrg	union {
676428d7b3dSmrg		struct sna_coordinate p;
677428d7b3dSmrg		float f;
678428d7b3dSmrg	} dst;
679428d7b3dSmrg
680428d7b3dSmrg	float xx = op->src.transform->matrix[0][0];
681428d7b3dSmrg	float x0 = op->src.transform->matrix[0][2];
682428d7b3dSmrg	float yy = op->src.transform->matrix[1][1];
683428d7b3dSmrg	float y0 = op->src.transform->matrix[1][2];
684428d7b3dSmrg	float sx = op->src.scale[0];
685428d7b3dSmrg	float sy = op->src.scale[1];
686428d7b3dSmrg	int16_t tx = op->src.offset[0];
687428d7b3dSmrg	int16_t ty = op->src.offset[1];
688428d7b3dSmrg
689428d7b3dSmrg	assert(op->floats_per_rect == 9);
690428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
691428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
692428d7b3dSmrg	sna->render.vertex_used += 3*3;
693428d7b3dSmrg
694428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
695428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
696428d7b3dSmrg	v[0] = dst.f;
697428d7b3dSmrg	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
698428d7b3dSmrg	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
699428d7b3dSmrg
700428d7b3dSmrg	dst.p.x = r->dst.x;
701428d7b3dSmrg	v[3] = dst.f;
702428d7b3dSmrg	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
703428d7b3dSmrg
704428d7b3dSmrg	dst.p.y = r->dst.y;
705428d7b3dSmrg	v[6] = dst.f;
706428d7b3dSmrg	v[8] = ((r->src.y + ty) * yy + y0) * sy;
707428d7b3dSmrg}
708428d7b3dSmrg
709428d7b3dSmrgsse2 fastcall static void
710428d7b3dSmrgemit_boxes_simple_source(const struct sna_composite_op *op,
711428d7b3dSmrg			 const BoxRec *box, int nbox,
712428d7b3dSmrg			 float *v)
713428d7b3dSmrg{
714428d7b3dSmrg	float xx = op->src.transform->matrix[0][0];
715428d7b3dSmrg	float x0 = op->src.transform->matrix[0][2];
716428d7b3dSmrg	float yy = op->src.transform->matrix[1][1];
717428d7b3dSmrg	float y0 = op->src.transform->matrix[1][2];
718428d7b3dSmrg	float sx = op->src.scale[0];
719428d7b3dSmrg	float sy = op->src.scale[1];
720428d7b3dSmrg	int16_t tx = op->src.offset[0];
721428d7b3dSmrg	int16_t ty = op->src.offset[1];
722428d7b3dSmrg
723428d7b3dSmrg	do {
724428d7b3dSmrg		union {
725428d7b3dSmrg			struct sna_coordinate p;
726428d7b3dSmrg			float f;
727428d7b3dSmrg		} dst;
728428d7b3dSmrg
729428d7b3dSmrg		dst.p.x = box->x2;
730428d7b3dSmrg		dst.p.y = box->y2;
731428d7b3dSmrg		v[0] = dst.f;
732428d7b3dSmrg		v[1] = ((box->x2 + tx) * xx + x0) * sx;
733428d7b3dSmrg		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
734428d7b3dSmrg
735428d7b3dSmrg		dst.p.x = box->x1;
736428d7b3dSmrg		v[3] = dst.f;
737428d7b3dSmrg		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
738428d7b3dSmrg
739428d7b3dSmrg		dst.p.y = box->y1;
740428d7b3dSmrg		v[6] = dst.f;
741428d7b3dSmrg		v[8] = ((box->y1 + ty) * yy + y0) * sy;
742428d7b3dSmrg
743428d7b3dSmrg		v += 9;
744428d7b3dSmrg		box++;
745428d7b3dSmrg	} while (--nbox);
746428d7b3dSmrg}
747428d7b3dSmrg
748428d7b3dSmrgsse2 fastcall static void
749428d7b3dSmrgemit_primitive_affine_source(struct sna *sna,
750428d7b3dSmrg			     const struct sna_composite_op *op,
751428d7b3dSmrg			     const struct sna_composite_rectangles *r)
752428d7b3dSmrg{
753428d7b3dSmrg	union {
754428d7b3dSmrg		struct sna_coordinate p;
755428d7b3dSmrg		float f;
756428d7b3dSmrg	} dst;
757428d7b3dSmrg	float *v;
758428d7b3dSmrg
759428d7b3dSmrg	assert(op->floats_per_rect == 9);
760428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
761428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
762428d7b3dSmrg	sna->render.vertex_used += 9;
763428d7b3dSmrg
764428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
765428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
766428d7b3dSmrg	v[0] = dst.f;
767428d7b3dSmrg	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
768428d7b3dSmrg				    op->src.offset[1] + r->src.y + r->height,
769428d7b3dSmrg				    op->src.transform, op->src.scale,
770428d7b3dSmrg				    &v[1], &v[2]);
771428d7b3dSmrg
772428d7b3dSmrg	dst.p.x = r->dst.x;
773428d7b3dSmrg	v[3] = dst.f;
774428d7b3dSmrg	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
775428d7b3dSmrg				    op->src.offset[1] + r->src.y + r->height,
776428d7b3dSmrg				    op->src.transform, op->src.scale,
777428d7b3dSmrg				    &v[4], &v[5]);
778428d7b3dSmrg
779428d7b3dSmrg	dst.p.y = r->dst.y;
780428d7b3dSmrg	v[6] = dst.f;
781428d7b3dSmrg	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
782428d7b3dSmrg				    op->src.offset[1] + r->src.y,
783428d7b3dSmrg				    op->src.transform, op->src.scale,
784428d7b3dSmrg				    &v[7], &v[8]);
785428d7b3dSmrg}
786428d7b3dSmrg
787428d7b3dSmrgsse2 fastcall static void
788428d7b3dSmrgemit_boxes_affine_source(const struct sna_composite_op *op,
789428d7b3dSmrg			 const BoxRec *box, int nbox,
790428d7b3dSmrg			 float *v)
791428d7b3dSmrg{
792428d7b3dSmrg	do {
793428d7b3dSmrg		union {
794428d7b3dSmrg			struct sna_coordinate p;
795428d7b3dSmrg			float f;
796428d7b3dSmrg		} dst;
797428d7b3dSmrg
798428d7b3dSmrg		dst.p.x = box->x2;
799428d7b3dSmrg		dst.p.y = box->y2;
800428d7b3dSmrg		v[0] = dst.f;
801428d7b3dSmrg		_sna_get_transformed_scaled(op->src.offset[0] + box->x2,
802428d7b3dSmrg					    op->src.offset[1] + box->y2,
803428d7b3dSmrg					    op->src.transform, op->src.scale,
804428d7b3dSmrg					    &v[1], &v[2]);
805428d7b3dSmrg
806428d7b3dSmrg		dst.p.x = box->x1;
807428d7b3dSmrg		v[3] = dst.f;
808428d7b3dSmrg		_sna_get_transformed_scaled(op->src.offset[0] + box->x1,
809428d7b3dSmrg					    op->src.offset[1] + box->y2,
810428d7b3dSmrg					    op->src.transform, op->src.scale,
811428d7b3dSmrg					    &v[4], &v[5]);
812428d7b3dSmrg
813428d7b3dSmrg		dst.p.y = box->y1;
814428d7b3dSmrg		v[6] = dst.f;
815428d7b3dSmrg		_sna_get_transformed_scaled(op->src.offset[0] + box->x1,
816428d7b3dSmrg					    op->src.offset[1] + box->y1,
817428d7b3dSmrg					    op->src.transform, op->src.scale,
818428d7b3dSmrg					    &v[7], &v[8]);
819428d7b3dSmrg		box++;
820428d7b3dSmrg		v += 9;
821428d7b3dSmrg	} while (--nbox);
822428d7b3dSmrg}
823428d7b3dSmrg
824428d7b3dSmrgsse2 fastcall static void
825428d7b3dSmrgemit_primitive_identity_mask(struct sna *sna,
826428d7b3dSmrg			     const struct sna_composite_op *op,
827428d7b3dSmrg			     const struct sna_composite_rectangles *r)
828428d7b3dSmrg{
829428d7b3dSmrg	union {
830428d7b3dSmrg		struct sna_coordinate p;
831428d7b3dSmrg		float f;
832428d7b3dSmrg	} dst;
833428d7b3dSmrg	float msk_x, msk_y;
834428d7b3dSmrg	float w, h;
835428d7b3dSmrg	float *v;
836428d7b3dSmrg
837428d7b3dSmrg	msk_x = r->mask.x + op->mask.offset[0];
838428d7b3dSmrg	msk_y = r->mask.y + op->mask.offset[1];
839428d7b3dSmrg	w = r->width;
840428d7b3dSmrg	h = r->height;
841428d7b3dSmrg
842428d7b3dSmrg	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
843428d7b3dSmrg	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
844428d7b3dSmrg
845428d7b3dSmrg	assert(op->floats_per_rect == 12);
846428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
847428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
848428d7b3dSmrg	sna->render.vertex_used += 12;
849428d7b3dSmrg
850428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
851428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
852428d7b3dSmrg	v[0] = dst.f;
853428d7b3dSmrg	v[2] = (msk_x + w) * op->mask.scale[0];
854428d7b3dSmrg	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
855428d7b3dSmrg
856428d7b3dSmrg	dst.p.x = r->dst.x;
857428d7b3dSmrg	v[4] = dst.f;
858428d7b3dSmrg	v[10] = v[6] = msk_x * op->mask.scale[0];
859428d7b3dSmrg
860428d7b3dSmrg	dst.p.y = r->dst.y;
861428d7b3dSmrg	v[8] = dst.f;
862428d7b3dSmrg	v[11] = msk_y * op->mask.scale[1];
863428d7b3dSmrg
864428d7b3dSmrg	v[9] = v[5] = v[1] = .5;
865428d7b3dSmrg}
866428d7b3dSmrg
867428d7b3dSmrgsse2 fastcall static void
868428d7b3dSmrgemit_boxes_identity_mask(const struct sna_composite_op *op,
869428d7b3dSmrg			 const BoxRec *box, int nbox,
870428d7b3dSmrg			 float *v)
871428d7b3dSmrg{
872428d7b3dSmrg	float msk_x = op->mask.offset[0];
873428d7b3dSmrg	float msk_y = op->mask.offset[1];
874428d7b3dSmrg
875428d7b3dSmrg	do {
876428d7b3dSmrg		union {
877428d7b3dSmrg			struct sna_coordinate p;
878428d7b3dSmrg			float f;
879428d7b3dSmrg		} dst;
880428d7b3dSmrg
881428d7b3dSmrg		dst.p.x = box->x2;
882428d7b3dSmrg		dst.p.y = box->y2;
883428d7b3dSmrg		v[0] = dst.f;
884428d7b3dSmrg		v[2] = (msk_x + box->x2) * op->mask.scale[0];
885428d7b3dSmrg		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
886428d7b3dSmrg
887428d7b3dSmrg		dst.p.x = box->x1;
888428d7b3dSmrg		v[4] = dst.f;
889428d7b3dSmrg		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
890428d7b3dSmrg
891428d7b3dSmrg		dst.p.y = box->y1;
892428d7b3dSmrg		v[8] = dst.f;
893428d7b3dSmrg		v[11] = (msk_y + box->y1) * op->mask.scale[1];
894428d7b3dSmrg
895428d7b3dSmrg		v[9] = v[5] = v[1] = .5;
896428d7b3dSmrg		v += 12;
897428d7b3dSmrg		box++;
898428d7b3dSmrg	} while (--nbox);
899428d7b3dSmrg}
900428d7b3dSmrg
901428d7b3dSmrgsse2 fastcall static void
902428d7b3dSmrgemit_primitive_linear_identity_mask(struct sna *sna,
903428d7b3dSmrg				    const struct sna_composite_op *op,
904428d7b3dSmrg				    const struct sna_composite_rectangles *r)
905428d7b3dSmrg{
906428d7b3dSmrg	union {
907428d7b3dSmrg		struct sna_coordinate p;
908428d7b3dSmrg		float f;
909428d7b3dSmrg	} dst;
910428d7b3dSmrg	float msk_x, msk_y;
911428d7b3dSmrg	float w, h;
912428d7b3dSmrg	float *v;
913428d7b3dSmrg
914428d7b3dSmrg	msk_x = r->mask.x + op->mask.offset[0];
915428d7b3dSmrg	msk_y = r->mask.y + op->mask.offset[1];
916428d7b3dSmrg	w = r->width;
917428d7b3dSmrg	h = r->height;
918428d7b3dSmrg
919428d7b3dSmrg	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
920428d7b3dSmrg	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
921428d7b3dSmrg
922428d7b3dSmrg	assert(op->floats_per_rect == 12);
923428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
924428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
925428d7b3dSmrg	sna->render.vertex_used += 12;
926428d7b3dSmrg
927428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
928428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
929428d7b3dSmrg	v[0] = dst.f;
930428d7b3dSmrg	v[2] = (msk_x + w) * op->mask.scale[0];
931428d7b3dSmrg	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
932428d7b3dSmrg
933428d7b3dSmrg	dst.p.x = r->dst.x;
934428d7b3dSmrg	v[4] = dst.f;
935428d7b3dSmrg	v[10] = v[6] = msk_x * op->mask.scale[0];
936428d7b3dSmrg
937428d7b3dSmrg	dst.p.y = r->dst.y;
938428d7b3dSmrg	v[8] = dst.f;
939428d7b3dSmrg	v[11] = msk_y * op->mask.scale[1];
940428d7b3dSmrg
941428d7b3dSmrg	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
942428d7b3dSmrg	v[5] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
943428d7b3dSmrg	v[9] = compute_linear(&op->src, r->src.x, r->src.y);
944428d7b3dSmrg}
945428d7b3dSmrg
946428d7b3dSmrgsse2 fastcall static void
947428d7b3dSmrgemit_boxes_linear_identity_mask(const struct sna_composite_op *op,
948428d7b3dSmrg				const BoxRec *box, int nbox,
949428d7b3dSmrg				float *v)
950428d7b3dSmrg{
951428d7b3dSmrg	float msk_x = op->mask.offset[0];
952428d7b3dSmrg	float msk_y = op->mask.offset[1];
953428d7b3dSmrg
954428d7b3dSmrg	do {
955428d7b3dSmrg		union {
956428d7b3dSmrg			struct sna_coordinate p;
957428d7b3dSmrg			float f;
958428d7b3dSmrg		} dst;
959428d7b3dSmrg
960428d7b3dSmrg		dst.p.x = box->x2;
961428d7b3dSmrg		dst.p.y = box->y2;
962428d7b3dSmrg		v[0] = dst.f;
963428d7b3dSmrg		v[2] = (msk_x + box->x2) * op->mask.scale[0];
964428d7b3dSmrg		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
965428d7b3dSmrg
966428d7b3dSmrg		dst.p.x = box->x1;
967428d7b3dSmrg		v[4] = dst.f;
968428d7b3dSmrg		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
969428d7b3dSmrg
970428d7b3dSmrg		dst.p.y = box->y1;
971428d7b3dSmrg		v[8] = dst.f;
972428d7b3dSmrg		v[11] = (msk_y + box->y1) * op->mask.scale[1];
973428d7b3dSmrg
974428d7b3dSmrg		v[1] = compute_linear(&op->src, box->x2, box->y2);
975428d7b3dSmrg		v[5] = compute_linear(&op->src, box->x1, box->y2);
976428d7b3dSmrg		v[9] = compute_linear(&op->src, box->x1, box->y1);
977428d7b3dSmrg
978428d7b3dSmrg		v += 12;
979428d7b3dSmrg		box++;
980428d7b3dSmrg	} while (--nbox);
981428d7b3dSmrg}
982428d7b3dSmrg
983428d7b3dSmrgsse2 fastcall static void
984428d7b3dSmrgemit_primitive_identity_source_mask(struct sna *sna,
985428d7b3dSmrg				    const struct sna_composite_op *op,
986428d7b3dSmrg				    const struct sna_composite_rectangles *r)
987428d7b3dSmrg{
988428d7b3dSmrg	union {
989428d7b3dSmrg		struct sna_coordinate p;
990428d7b3dSmrg		float f;
991428d7b3dSmrg	} dst;
992428d7b3dSmrg	float src_x, src_y;
993428d7b3dSmrg	float msk_x, msk_y;
994428d7b3dSmrg	float w, h;
995428d7b3dSmrg	float *v;
996428d7b3dSmrg
997428d7b3dSmrg	src_x = r->src.x + op->src.offset[0];
998428d7b3dSmrg	src_y = r->src.y + op->src.offset[1];
999428d7b3dSmrg	msk_x = r->mask.x + op->mask.offset[0];
1000428d7b3dSmrg	msk_y = r->mask.y + op->mask.offset[1];
1001428d7b3dSmrg	w = r->width;
1002428d7b3dSmrg	h = r->height;
1003428d7b3dSmrg
1004428d7b3dSmrg	assert(op->floats_per_rect == 15);
1005428d7b3dSmrg	assert((sna->render.vertex_used % 5) == 0);
1006428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1007428d7b3dSmrg	sna->render.vertex_used += 15;
1008428d7b3dSmrg
1009428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1010428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1011428d7b3dSmrg	v[0] = dst.f;
1012428d7b3dSmrg	v[1] = (src_x + w) * op->src.scale[0];
1013428d7b3dSmrg	v[2] = (src_y + h) * op->src.scale[1];
1014428d7b3dSmrg	v[3] = (msk_x + w) * op->mask.scale[0];
1015428d7b3dSmrg	v[4] = (msk_y + h) * op->mask.scale[1];
1016428d7b3dSmrg
1017428d7b3dSmrg	dst.p.x = r->dst.x;
1018428d7b3dSmrg	v[5] = dst.f;
1019428d7b3dSmrg	v[6] = src_x * op->src.scale[0];
1020428d7b3dSmrg	v[7] = v[2];
1021428d7b3dSmrg	v[8] = msk_x * op->mask.scale[0];
1022428d7b3dSmrg	v[9] = v[4];
1023428d7b3dSmrg
1024428d7b3dSmrg	dst.p.y = r->dst.y;
1025428d7b3dSmrg	v[10] = dst.f;
1026428d7b3dSmrg	v[11] = v[6];
1027428d7b3dSmrg	v[12] = src_y * op->src.scale[1];
1028428d7b3dSmrg	v[13] = v[8];
1029428d7b3dSmrg	v[14] = msk_y * op->mask.scale[1];
1030428d7b3dSmrg}
1031428d7b3dSmrg
1032428d7b3dSmrgsse2 fastcall static void
1033428d7b3dSmrgemit_primitive_simple_source_identity(struct sna *sna,
1034428d7b3dSmrg				      const struct sna_composite_op *op,
1035428d7b3dSmrg				      const struct sna_composite_rectangles *r)
1036428d7b3dSmrg{
1037428d7b3dSmrg	float *v;
1038428d7b3dSmrg	union {
1039428d7b3dSmrg		struct sna_coordinate p;
1040428d7b3dSmrg		float f;
1041428d7b3dSmrg	} dst;
1042428d7b3dSmrg
1043428d7b3dSmrg	float xx = op->src.transform->matrix[0][0];
1044428d7b3dSmrg	float x0 = op->src.transform->matrix[0][2];
1045428d7b3dSmrg	float yy = op->src.transform->matrix[1][1];
1046428d7b3dSmrg	float y0 = op->src.transform->matrix[1][2];
1047428d7b3dSmrg	float sx = op->src.scale[0];
1048428d7b3dSmrg	float sy = op->src.scale[1];
1049428d7b3dSmrg	int16_t tx = op->src.offset[0];
1050428d7b3dSmrg	int16_t ty = op->src.offset[1];
1051428d7b3dSmrg	float msk_x = r->mask.x + op->mask.offset[0];
1052428d7b3dSmrg	float msk_y = r->mask.y + op->mask.offset[1];
1053428d7b3dSmrg	float w = r->width, h = r->height;
1054428d7b3dSmrg
1055428d7b3dSmrg	assert(op->floats_per_rect == 15);
1056428d7b3dSmrg	assert((sna->render.vertex_used % 5) == 0);
1057428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1058428d7b3dSmrg	sna->render.vertex_used += 3*5;
1059428d7b3dSmrg
1060428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1061428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1062428d7b3dSmrg	v[0] = dst.f;
1063428d7b3dSmrg	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
1064428d7b3dSmrg	v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
1065428d7b3dSmrg	v[3] = (msk_x + w) * op->mask.scale[0];
1066428d7b3dSmrg	v[4] = (msk_y + h) * op->mask.scale[1];
1067428d7b3dSmrg
1068428d7b3dSmrg	dst.p.x = r->dst.x;
1069428d7b3dSmrg	v[5] = dst.f;
1070428d7b3dSmrg	v[6] = ((r->src.x + tx) * xx + x0) * sx;
1071428d7b3dSmrg	v[7] = v[2];
1072428d7b3dSmrg	v[8] = msk_x * op->mask.scale[0];
1073428d7b3dSmrg	v[9] = v[4];
1074428d7b3dSmrg
1075428d7b3dSmrg	dst.p.y = r->dst.y;
1076428d7b3dSmrg	v[10] = dst.f;
1077428d7b3dSmrg	v[11] = v[6];
1078428d7b3dSmrg	v[12] = ((r->src.y + ty) * yy + y0) * sy;
1079428d7b3dSmrg	v[13] = v[8];
1080428d7b3dSmrg	v[14] = msk_y * op->mask.scale[1];
1081428d7b3dSmrg}
1082428d7b3dSmrg
1083428d7b3dSmrgsse2 fastcall static void
1084428d7b3dSmrgemit_primitive_affine_source_identity(struct sna *sna,
1085428d7b3dSmrg				      const struct sna_composite_op *op,
1086428d7b3dSmrg				      const struct sna_composite_rectangles *r)
1087428d7b3dSmrg{
1088428d7b3dSmrg	float *v;
1089428d7b3dSmrg	union {
1090428d7b3dSmrg		struct sna_coordinate p;
1091428d7b3dSmrg		float f;
1092428d7b3dSmrg	} dst;
1093428d7b3dSmrg	float msk_x = r->mask.x + op->mask.offset[0];
1094428d7b3dSmrg	float msk_y = r->mask.y + op->mask.offset[1];
1095428d7b3dSmrg	float w = r->width, h = r->height;
1096428d7b3dSmrg
1097428d7b3dSmrg	assert(op->floats_per_rect == 15);
1098428d7b3dSmrg	assert((sna->render.vertex_used % 5) == 0);
1099428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1100428d7b3dSmrg	sna->render.vertex_used += 3*5;
1101428d7b3dSmrg
1102428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1103428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1104428d7b3dSmrg	v[0] = dst.f;
1105428d7b3dSmrg	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
1106428d7b3dSmrg				    op->src.offset[1] + r->src.y + r->height,
1107428d7b3dSmrg				    op->src.transform, op->src.scale,
1108428d7b3dSmrg				    &v[1], &v[2]);
1109428d7b3dSmrg	v[3] = (msk_x + w) * op->mask.scale[0];
1110428d7b3dSmrg	v[4] = (msk_y + h) * op->mask.scale[1];
1111428d7b3dSmrg
1112428d7b3dSmrg	dst.p.x = r->dst.x;
1113428d7b3dSmrg	v[5] = dst.f;
1114428d7b3dSmrg	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
1115428d7b3dSmrg				    op->src.offset[1] + r->src.y + r->height,
1116428d7b3dSmrg				    op->src.transform, op->src.scale,
1117428d7b3dSmrg				    &v[6], &v[7]);
1118428d7b3dSmrg	v[8] = msk_x * op->mask.scale[0];
1119428d7b3dSmrg	v[9] = v[4];
1120428d7b3dSmrg
1121428d7b3dSmrg	dst.p.y = r->dst.y;
1122428d7b3dSmrg	v[10] = dst.f;
1123428d7b3dSmrg	_sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
1124428d7b3dSmrg				    op->src.offset[1] + r->src.y,
1125428d7b3dSmrg				    op->src.transform, op->src.scale,
1126428d7b3dSmrg				    &v[11], &v[12]);
1127428d7b3dSmrg	v[13] = v[8];
1128428d7b3dSmrg	v[14] = msk_y * op->mask.scale[1];
1129428d7b3dSmrg}
1130428d7b3dSmrg
1131428d7b3dSmrg/* SSE4_2 */
1132428d7b3dSmrg#if defined(sse4_2)
1133428d7b3dSmrg
1134428d7b3dSmrgsse4_2 fastcall static void
1135428d7b3dSmrgemit_primitive_linear__sse4_2(struct sna *sna,
1136428d7b3dSmrg			      const struct sna_composite_op *op,
1137428d7b3dSmrg			      const struct sna_composite_rectangles *r)
1138428d7b3dSmrg{
1139428d7b3dSmrg	float *v;
1140428d7b3dSmrg	union {
1141428d7b3dSmrg		struct sna_coordinate p;
1142428d7b3dSmrg		float f;
1143428d7b3dSmrg	} dst;
1144428d7b3dSmrg
1145428d7b3dSmrg	assert(op->floats_per_rect == 6);
1146428d7b3dSmrg	assert((sna->render.vertex_used % 2) == 0);
1147428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1148428d7b3dSmrg	sna->render.vertex_used += 6;
1149428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
1150428d7b3dSmrg
1151428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1152428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1153428d7b3dSmrg	v[0] = dst.f;
1154428d7b3dSmrg	dst.p.x = r->dst.x;
1155428d7b3dSmrg	v[2] = dst.f;
1156428d7b3dSmrg	dst.p.y = r->dst.y;
1157428d7b3dSmrg	v[4] = dst.f;
1158428d7b3dSmrg
1159428d7b3dSmrg	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
1160428d7b3dSmrg	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
1161428d7b3dSmrg	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
1162428d7b3dSmrg}
1163428d7b3dSmrg
1164428d7b3dSmrgsse4_2 fastcall static void
1165428d7b3dSmrgemit_boxes_linear__sse4_2(const struct sna_composite_op *op,
1166428d7b3dSmrg			  const BoxRec *box, int nbox,
1167428d7b3dSmrg			  float *v)
1168428d7b3dSmrg{
1169428d7b3dSmrg	union {
1170428d7b3dSmrg		struct sna_coordinate p;
1171428d7b3dSmrg		float f;
1172428d7b3dSmrg	} dst;
1173428d7b3dSmrg
1174428d7b3dSmrg	do {
1175428d7b3dSmrg		dst.p.x = box->x2;
1176428d7b3dSmrg		dst.p.y = box->y2;
1177428d7b3dSmrg		v[0] = dst.f;
1178428d7b3dSmrg		dst.p.x = box->x1;
1179428d7b3dSmrg		v[2] = dst.f;
1180428d7b3dSmrg		dst.p.y = box->y1;
1181428d7b3dSmrg		v[4] = dst.f;
1182428d7b3dSmrg
1183428d7b3dSmrg		v[1] = compute_linear(&op->src, box->x2, box->y2);
1184428d7b3dSmrg		v[3] = compute_linear(&op->src, box->x1, box->y2);
1185428d7b3dSmrg		v[5] = compute_linear(&op->src, box->x1, box->y1);
1186428d7b3dSmrg
1187428d7b3dSmrg		v += 6;
1188428d7b3dSmrg		box++;
1189428d7b3dSmrg	} while (--nbox);
1190428d7b3dSmrg}
1191428d7b3dSmrg
1192428d7b3dSmrgsse4_2 fastcall static void
1193428d7b3dSmrgemit_primitive_identity_source__sse4_2(struct sna *sna,
1194428d7b3dSmrg				       const struct sna_composite_op *op,
1195428d7b3dSmrg				       const struct sna_composite_rectangles *r)
1196428d7b3dSmrg{
1197428d7b3dSmrg	union {
1198428d7b3dSmrg		struct sna_coordinate p;
1199428d7b3dSmrg		float f;
1200428d7b3dSmrg	} dst;
1201428d7b3dSmrg	float *v;
1202428d7b3dSmrg
1203428d7b3dSmrg	assert(op->floats_per_rect == 9);
1204428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
1205428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1206428d7b3dSmrg	sna->render.vertex_used += 9;
1207428d7b3dSmrg
1208428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1209428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1210428d7b3dSmrg	v[0] = dst.f;
1211428d7b3dSmrg	dst.p.x = r->dst.x;
1212428d7b3dSmrg	v[3] = dst.f;
1213428d7b3dSmrg	dst.p.y = r->dst.y;
1214428d7b3dSmrg	v[6] = dst.f;
1215428d7b3dSmrg
1216428d7b3dSmrg	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
1217428d7b3dSmrg	v[1] = v[4] + r->width * op->src.scale[0];
1218428d7b3dSmrg
1219428d7b3dSmrg	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
1220428d7b3dSmrg	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
1221428d7b3dSmrg}
1222428d7b3dSmrg
1223428d7b3dSmrgsse4_2 fastcall static void
1224428d7b3dSmrgemit_boxes_identity_source__sse4_2(const struct sna_composite_op *op,
1225428d7b3dSmrg				   const BoxRec *box, int nbox,
1226428d7b3dSmrg				   float *v)
1227428d7b3dSmrg{
1228428d7b3dSmrg	do {
1229428d7b3dSmrg		union {
1230428d7b3dSmrg			struct sna_coordinate p;
1231428d7b3dSmrg			float f;
1232428d7b3dSmrg		} dst;
1233428d7b3dSmrg
1234428d7b3dSmrg		dst.p.x = box->x2;
1235428d7b3dSmrg		dst.p.y = box->y2;
1236428d7b3dSmrg		v[0] = dst.f;
1237428d7b3dSmrg		dst.p.x = box->x1;
1238428d7b3dSmrg		v[3] = dst.f;
1239428d7b3dSmrg		dst.p.y = box->y1;
1240428d7b3dSmrg		v[6] = dst.f;
1241428d7b3dSmrg
1242428d7b3dSmrg		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
1243428d7b3dSmrg		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
1244428d7b3dSmrg
1245428d7b3dSmrg		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
1246428d7b3dSmrg		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
1247428d7b3dSmrg
1248428d7b3dSmrg		v += 9;
1249428d7b3dSmrg		box++;
1250428d7b3dSmrg	} while (--nbox);
1251428d7b3dSmrg}
1252428d7b3dSmrg
1253428d7b3dSmrgsse4_2 fastcall static void
1254428d7b3dSmrgemit_primitive_simple_source__sse4_2(struct sna *sna,
1255428d7b3dSmrg				     const struct sna_composite_op *op,
1256428d7b3dSmrg				     const struct sna_composite_rectangles *r)
1257428d7b3dSmrg{
1258428d7b3dSmrg	float *v;
1259428d7b3dSmrg	union {
1260428d7b3dSmrg		struct sna_coordinate p;
1261428d7b3dSmrg		float f;
1262428d7b3dSmrg	} dst;
1263428d7b3dSmrg
1264428d7b3dSmrg	float xx = op->src.transform->matrix[0][0];
1265428d7b3dSmrg	float x0 = op->src.transform->matrix[0][2];
1266428d7b3dSmrg	float yy = op->src.transform->matrix[1][1];
1267428d7b3dSmrg	float y0 = op->src.transform->matrix[1][2];
1268428d7b3dSmrg	float sx = op->src.scale[0];
1269428d7b3dSmrg	float sy = op->src.scale[1];
1270428d7b3dSmrg	int16_t tx = op->src.offset[0];
1271428d7b3dSmrg	int16_t ty = op->src.offset[1];
1272428d7b3dSmrg
1273428d7b3dSmrg	assert(op->floats_per_rect == 9);
1274428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
1275428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1276428d7b3dSmrg	sna->render.vertex_used += 3*3;
1277428d7b3dSmrg
1278428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1279428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1280428d7b3dSmrg	v[0] = dst.f;
1281428d7b3dSmrg	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
1282428d7b3dSmrg	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
1283428d7b3dSmrg
1284428d7b3dSmrg	dst.p.x = r->dst.x;
1285428d7b3dSmrg	v[3] = dst.f;
1286428d7b3dSmrg	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
1287428d7b3dSmrg
1288428d7b3dSmrg	dst.p.y = r->dst.y;
1289428d7b3dSmrg	v[6] = dst.f;
1290428d7b3dSmrg	v[8] = ((r->src.y + ty) * yy + y0) * sy;
1291428d7b3dSmrg}
1292428d7b3dSmrg
1293428d7b3dSmrgsse4_2 fastcall static void
1294428d7b3dSmrgemit_boxes_simple_source__sse4_2(const struct sna_composite_op *op,
1295428d7b3dSmrg				 const BoxRec *box, int nbox,
1296428d7b3dSmrg				 float *v)
1297428d7b3dSmrg{
1298428d7b3dSmrg	float xx = op->src.transform->matrix[0][0];
1299428d7b3dSmrg	float x0 = op->src.transform->matrix[0][2];
1300428d7b3dSmrg	float yy = op->src.transform->matrix[1][1];
1301428d7b3dSmrg	float y0 = op->src.transform->matrix[1][2];
1302428d7b3dSmrg	float sx = op->src.scale[0];
1303428d7b3dSmrg	float sy = op->src.scale[1];
1304428d7b3dSmrg	int16_t tx = op->src.offset[0];
1305428d7b3dSmrg	int16_t ty = op->src.offset[1];
1306428d7b3dSmrg
1307428d7b3dSmrg	do {
1308428d7b3dSmrg		union {
1309428d7b3dSmrg			struct sna_coordinate p;
1310428d7b3dSmrg			float f;
1311428d7b3dSmrg		} dst;
1312428d7b3dSmrg
1313428d7b3dSmrg		dst.p.x = box->x2;
1314428d7b3dSmrg		dst.p.y = box->y2;
1315428d7b3dSmrg		v[0] = dst.f;
1316428d7b3dSmrg		v[1] = ((box->x2 + tx) * xx + x0) * sx;
1317428d7b3dSmrg		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
1318428d7b3dSmrg
1319428d7b3dSmrg		dst.p.x = box->x1;
1320428d7b3dSmrg		v[3] = dst.f;
1321428d7b3dSmrg		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
1322428d7b3dSmrg
1323428d7b3dSmrg		dst.p.y = box->y1;
1324428d7b3dSmrg		v[6] = dst.f;
1325428d7b3dSmrg		v[8] = ((box->y1 + ty) * yy + y0) * sy;
1326428d7b3dSmrg
1327428d7b3dSmrg		v += 9;
1328428d7b3dSmrg		box++;
1329428d7b3dSmrg	} while (--nbox);
1330428d7b3dSmrg}
1331428d7b3dSmrg
1332428d7b3dSmrgsse4_2 fastcall static void
1333428d7b3dSmrgemit_primitive_identity_mask__sse4_2(struct sna *sna,
1334428d7b3dSmrg				     const struct sna_composite_op *op,
1335428d7b3dSmrg				     const struct sna_composite_rectangles *r)
1336428d7b3dSmrg{
1337428d7b3dSmrg	union {
1338428d7b3dSmrg		struct sna_coordinate p;
1339428d7b3dSmrg		float f;
1340428d7b3dSmrg	} dst;
1341428d7b3dSmrg	float msk_x, msk_y;
1342428d7b3dSmrg	float w, h;
1343428d7b3dSmrg	float *v;
1344428d7b3dSmrg
1345428d7b3dSmrg	msk_x = r->mask.x + op->mask.offset[0];
1346428d7b3dSmrg	msk_y = r->mask.y + op->mask.offset[1];
1347428d7b3dSmrg	w = r->width;
1348428d7b3dSmrg	h = r->height;
1349428d7b3dSmrg
1350428d7b3dSmrg	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
1351428d7b3dSmrg	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
1352428d7b3dSmrg
1353428d7b3dSmrg	assert(op->floats_per_rect == 12);
1354428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
1355428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1356428d7b3dSmrg	sna->render.vertex_used += 12;
1357428d7b3dSmrg
1358428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1359428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1360428d7b3dSmrg	v[0] = dst.f;
1361428d7b3dSmrg	v[2] = (msk_x + w) * op->mask.scale[0];
1362428d7b3dSmrg	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
1363428d7b3dSmrg
1364428d7b3dSmrg	dst.p.x = r->dst.x;
1365428d7b3dSmrg	v[4] = dst.f;
1366428d7b3dSmrg	v[10] = v[6] = msk_x * op->mask.scale[0];
1367428d7b3dSmrg
1368428d7b3dSmrg	dst.p.y = r->dst.y;
1369428d7b3dSmrg	v[8] = dst.f;
1370428d7b3dSmrg	v[11] = msk_y * op->mask.scale[1];
1371428d7b3dSmrg
1372428d7b3dSmrg	v[9] = v[5] = v[1] = .5;
1373428d7b3dSmrg}
1374428d7b3dSmrg
1375428d7b3dSmrgsse4_2 fastcall static void
1376428d7b3dSmrgemit_boxes_identity_mask__sse4_2(const struct sna_composite_op *op,
1377428d7b3dSmrg				 const BoxRec *box, int nbox,
1378428d7b3dSmrg				 float *v)
1379428d7b3dSmrg{
1380428d7b3dSmrg	float msk_x = op->mask.offset[0];
1381428d7b3dSmrg	float msk_y = op->mask.offset[1];
1382428d7b3dSmrg
1383428d7b3dSmrg	do {
1384428d7b3dSmrg		union {
1385428d7b3dSmrg			struct sna_coordinate p;
1386428d7b3dSmrg			float f;
1387428d7b3dSmrg		} dst;
1388428d7b3dSmrg
1389428d7b3dSmrg		dst.p.x = box->x2;
1390428d7b3dSmrg		dst.p.y = box->y2;
1391428d7b3dSmrg		v[0] = dst.f;
1392428d7b3dSmrg		v[2] = (msk_x + box->x2) * op->mask.scale[0];
1393428d7b3dSmrg		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
1394428d7b3dSmrg
1395428d7b3dSmrg		dst.p.x = box->x1;
1396428d7b3dSmrg		v[4] = dst.f;
1397428d7b3dSmrg		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
1398428d7b3dSmrg
1399428d7b3dSmrg		dst.p.y = box->y1;
1400428d7b3dSmrg		v[8] = dst.f;
1401428d7b3dSmrg		v[11] = (msk_y + box->y1) * op->mask.scale[1];
1402428d7b3dSmrg
1403428d7b3dSmrg		v[9] = v[5] = v[1] = .5;
1404428d7b3dSmrg		v += 12;
1405428d7b3dSmrg		box++;
1406428d7b3dSmrg	} while (--nbox);
1407428d7b3dSmrg}
1408428d7b3dSmrg
1409428d7b3dSmrgsse4_2 fastcall static void
1410428d7b3dSmrgemit_primitive_linear_identity_mask__sse4_2(struct sna *sna,
1411428d7b3dSmrg					    const struct sna_composite_op *op,
1412428d7b3dSmrg					    const struct sna_composite_rectangles *r)
1413428d7b3dSmrg{
1414428d7b3dSmrg	union {
1415428d7b3dSmrg		struct sna_coordinate p;
1416428d7b3dSmrg		float f;
1417428d7b3dSmrg	} dst;
1418428d7b3dSmrg	float msk_x, msk_y;
1419428d7b3dSmrg	float w, h;
1420428d7b3dSmrg	float *v;
1421428d7b3dSmrg
1422428d7b3dSmrg	msk_x = r->mask.x + op->mask.offset[0];
1423428d7b3dSmrg	msk_y = r->mask.y + op->mask.offset[1];
1424428d7b3dSmrg	w = r->width;
1425428d7b3dSmrg	h = r->height;
1426428d7b3dSmrg
1427428d7b3dSmrg	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
1428428d7b3dSmrg	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
1429428d7b3dSmrg
1430428d7b3dSmrg	assert(op->floats_per_rect == 12);
1431428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
1432428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1433428d7b3dSmrg	sna->render.vertex_used += 12;
1434428d7b3dSmrg
1435428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1436428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1437428d7b3dSmrg	v[0] = dst.f;
1438428d7b3dSmrg	v[2] = (msk_x + w) * op->mask.scale[0];
1439428d7b3dSmrg	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
1440428d7b3dSmrg
1441428d7b3dSmrg	dst.p.x = r->dst.x;
1442428d7b3dSmrg	v[4] = dst.f;
1443428d7b3dSmrg	v[10] = v[6] = msk_x * op->mask.scale[0];
1444428d7b3dSmrg
1445428d7b3dSmrg	dst.p.y = r->dst.y;
1446428d7b3dSmrg	v[8] = dst.f;
1447428d7b3dSmrg	v[11] = msk_y * op->mask.scale[1];
1448428d7b3dSmrg
1449428d7b3dSmrg	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
1450428d7b3dSmrg	v[5] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
1451428d7b3dSmrg	v[9] = compute_linear(&op->src, r->src.x, r->src.y);
1452428d7b3dSmrg}
1453428d7b3dSmrg
1454428d7b3dSmrgsse4_2 fastcall static void
1455428d7b3dSmrgemit_boxes_linear_identity_mask__sse4_2(const struct sna_composite_op *op,
1456428d7b3dSmrg					const BoxRec *box, int nbox,
1457428d7b3dSmrg					float *v)
1458428d7b3dSmrg{
1459428d7b3dSmrg	float msk_x = op->mask.offset[0];
1460428d7b3dSmrg	float msk_y = op->mask.offset[1];
1461428d7b3dSmrg
1462428d7b3dSmrg	do {
1463428d7b3dSmrg		union {
1464428d7b3dSmrg			struct sna_coordinate p;
1465428d7b3dSmrg			float f;
1466428d7b3dSmrg		} dst;
1467428d7b3dSmrg
1468428d7b3dSmrg		dst.p.x = box->x2;
1469428d7b3dSmrg		dst.p.y = box->y2;
1470428d7b3dSmrg		v[0] = dst.f;
1471428d7b3dSmrg		v[2] = (msk_x + box->x2) * op->mask.scale[0];
1472428d7b3dSmrg		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
1473428d7b3dSmrg
1474428d7b3dSmrg		dst.p.x = box->x1;
1475428d7b3dSmrg		v[4] = dst.f;
1476428d7b3dSmrg		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
1477428d7b3dSmrg
1478428d7b3dSmrg		dst.p.y = box->y1;
1479428d7b3dSmrg		v[8] = dst.f;
1480428d7b3dSmrg		v[11] = (msk_y + box->y1) * op->mask.scale[1];
1481428d7b3dSmrg
1482428d7b3dSmrg		v[1] = compute_linear(&op->src, box->x2, box->y2);
1483428d7b3dSmrg		v[5] = compute_linear(&op->src, box->x1, box->y2);
1484428d7b3dSmrg		v[9] = compute_linear(&op->src, box->x1, box->y1);
1485428d7b3dSmrg
1486428d7b3dSmrg		v += 12;
1487428d7b3dSmrg		box++;
1488428d7b3dSmrg	} while (--nbox);
1489428d7b3dSmrg}
1490428d7b3dSmrg
1491428d7b3dSmrg#endif
1492428d7b3dSmrg
1493428d7b3dSmrg/* AVX2 */
1494428d7b3dSmrg#if defined(avx2)
1495428d7b3dSmrg
1496428d7b3dSmrgavx2 fastcall static void
1497428d7b3dSmrgemit_primitive_linear__avx2(struct sna *sna,
1498428d7b3dSmrg			    const struct sna_composite_op *op,
1499428d7b3dSmrg			    const struct sna_composite_rectangles *r)
1500428d7b3dSmrg{
1501428d7b3dSmrg	float *v;
1502428d7b3dSmrg	union {
1503428d7b3dSmrg		struct sna_coordinate p;
1504428d7b3dSmrg		float f;
1505428d7b3dSmrg	} dst;
1506428d7b3dSmrg
1507428d7b3dSmrg	assert(op->floats_per_rect == 6);
1508428d7b3dSmrg	assert((sna->render.vertex_used % 2) == 0);
1509428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1510428d7b3dSmrg	sna->render.vertex_used += 6;
1511428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
1512428d7b3dSmrg
1513428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1514428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1515428d7b3dSmrg	v[0] = dst.f;
1516428d7b3dSmrg	dst.p.x = r->dst.x;
1517428d7b3dSmrg	v[2] = dst.f;
1518428d7b3dSmrg	dst.p.y = r->dst.y;
1519428d7b3dSmrg	v[4] = dst.f;
1520428d7b3dSmrg
1521428d7b3dSmrg	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
1522428d7b3dSmrg	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
1523428d7b3dSmrg	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
1524428d7b3dSmrg}
1525428d7b3dSmrg
1526428d7b3dSmrgavx2 fastcall static void
1527428d7b3dSmrgemit_boxes_linear__avx2(const struct sna_composite_op *op,
1528428d7b3dSmrg			const BoxRec *box, int nbox,
1529428d7b3dSmrg			float *v)
1530428d7b3dSmrg{
1531428d7b3dSmrg	union {
1532428d7b3dSmrg		struct sna_coordinate p;
1533428d7b3dSmrg		float f;
1534428d7b3dSmrg	} dst;
1535428d7b3dSmrg
1536428d7b3dSmrg	do {
1537428d7b3dSmrg		dst.p.x = box->x2;
1538428d7b3dSmrg		dst.p.y = box->y2;
1539428d7b3dSmrg		v[0] = dst.f;
1540428d7b3dSmrg		dst.p.x = box->x1;
1541428d7b3dSmrg		v[2] = dst.f;
1542428d7b3dSmrg		dst.p.y = box->y1;
1543428d7b3dSmrg		v[4] = dst.f;
1544428d7b3dSmrg
1545428d7b3dSmrg		v[1] = compute_linear(&op->src, box->x2, box->y2);
1546428d7b3dSmrg		v[3] = compute_linear(&op->src, box->x1, box->y2);
1547428d7b3dSmrg		v[5] = compute_linear(&op->src, box->x1, box->y1);
1548428d7b3dSmrg
1549428d7b3dSmrg		v += 6;
1550428d7b3dSmrg		box++;
1551428d7b3dSmrg	} while (--nbox);
1552428d7b3dSmrg}
1553428d7b3dSmrg
1554428d7b3dSmrgavx2 fastcall static void
1555428d7b3dSmrgemit_primitive_identity_source__avx2(struct sna *sna,
1556428d7b3dSmrg				     const struct sna_composite_op *op,
1557428d7b3dSmrg				     const struct sna_composite_rectangles *r)
1558428d7b3dSmrg{
1559428d7b3dSmrg	union {
1560428d7b3dSmrg		struct sna_coordinate p;
1561428d7b3dSmrg		float f;
1562428d7b3dSmrg	} dst;
1563428d7b3dSmrg	float *v;
1564428d7b3dSmrg
1565428d7b3dSmrg	assert(op->floats_per_rect == 9);
1566428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
1567428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1568428d7b3dSmrg	sna->render.vertex_used += 9;
1569428d7b3dSmrg
1570428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1571428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1572428d7b3dSmrg	v[0] = dst.f;
1573428d7b3dSmrg	dst.p.x = r->dst.x;
1574428d7b3dSmrg	v[3] = dst.f;
1575428d7b3dSmrg	dst.p.y = r->dst.y;
1576428d7b3dSmrg	v[6] = dst.f;
1577428d7b3dSmrg
1578428d7b3dSmrg	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
1579428d7b3dSmrg	v[1] = v[4] + r->width * op->src.scale[0];
1580428d7b3dSmrg
1581428d7b3dSmrg	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
1582428d7b3dSmrg	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
1583428d7b3dSmrg}
1584428d7b3dSmrg
1585428d7b3dSmrgavx2 fastcall static void
1586428d7b3dSmrgemit_boxes_identity_source__avx2(const struct sna_composite_op *op,
1587428d7b3dSmrg				 const BoxRec *box, int nbox,
1588428d7b3dSmrg				 float *v)
1589428d7b3dSmrg{
1590428d7b3dSmrg	do {
1591428d7b3dSmrg		union {
1592428d7b3dSmrg			struct sna_coordinate p;
1593428d7b3dSmrg			float f;
1594428d7b3dSmrg		} dst;
1595428d7b3dSmrg
1596428d7b3dSmrg		dst.p.x = box->x2;
1597428d7b3dSmrg		dst.p.y = box->y2;
1598428d7b3dSmrg		v[0] = dst.f;
1599428d7b3dSmrg		dst.p.x = box->x1;
1600428d7b3dSmrg		v[3] = dst.f;
1601428d7b3dSmrg		dst.p.y = box->y1;
1602428d7b3dSmrg		v[6] = dst.f;
1603428d7b3dSmrg
1604428d7b3dSmrg		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
1605428d7b3dSmrg		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
1606428d7b3dSmrg
1607428d7b3dSmrg		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
1608428d7b3dSmrg		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
1609428d7b3dSmrg
1610428d7b3dSmrg		v += 9;
1611428d7b3dSmrg		box++;
1612428d7b3dSmrg	} while (--nbox);
1613428d7b3dSmrg}
1614428d7b3dSmrg
1615428d7b3dSmrgavx2 fastcall static void
1616428d7b3dSmrgemit_primitive_simple_source__avx2(struct sna *sna,
1617428d7b3dSmrg				   const struct sna_composite_op *op,
1618428d7b3dSmrg				   const struct sna_composite_rectangles *r)
1619428d7b3dSmrg{
1620428d7b3dSmrg	float *v;
1621428d7b3dSmrg	union {
1622428d7b3dSmrg		struct sna_coordinate p;
1623428d7b3dSmrg		float f;
1624428d7b3dSmrg	} dst;
1625428d7b3dSmrg
1626428d7b3dSmrg	float xx = op->src.transform->matrix[0][0];
1627428d7b3dSmrg	float x0 = op->src.transform->matrix[0][2];
1628428d7b3dSmrg	float yy = op->src.transform->matrix[1][1];
1629428d7b3dSmrg	float y0 = op->src.transform->matrix[1][2];
1630428d7b3dSmrg	float sx = op->src.scale[0];
1631428d7b3dSmrg	float sy = op->src.scale[1];
1632428d7b3dSmrg	int16_t tx = op->src.offset[0];
1633428d7b3dSmrg	int16_t ty = op->src.offset[1];
1634428d7b3dSmrg
1635428d7b3dSmrg	assert(op->floats_per_rect == 9);
1636428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
1637428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1638428d7b3dSmrg	sna->render.vertex_used += 3*3;
1639428d7b3dSmrg
1640428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1641428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1642428d7b3dSmrg	v[0] = dst.f;
1643428d7b3dSmrg	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
1644428d7b3dSmrg	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
1645428d7b3dSmrg
1646428d7b3dSmrg	dst.p.x = r->dst.x;
1647428d7b3dSmrg	v[3] = dst.f;
1648428d7b3dSmrg	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
1649428d7b3dSmrg
1650428d7b3dSmrg	dst.p.y = r->dst.y;
1651428d7b3dSmrg	v[6] = dst.f;
1652428d7b3dSmrg	v[8] = ((r->src.y + ty) * yy + y0) * sy;
1653428d7b3dSmrg}
1654428d7b3dSmrg
1655428d7b3dSmrgavx2 fastcall static void
1656428d7b3dSmrgemit_boxes_simple_source__avx2(const struct sna_composite_op *op,
1657428d7b3dSmrg			       const BoxRec *box, int nbox,
1658428d7b3dSmrg			       float *v)
1659428d7b3dSmrg{
1660428d7b3dSmrg	float xx = op->src.transform->matrix[0][0];
1661428d7b3dSmrg	float x0 = op->src.transform->matrix[0][2];
1662428d7b3dSmrg	float yy = op->src.transform->matrix[1][1];
1663428d7b3dSmrg	float y0 = op->src.transform->matrix[1][2];
1664428d7b3dSmrg	float sx = op->src.scale[0];
1665428d7b3dSmrg	float sy = op->src.scale[1];
1666428d7b3dSmrg	int16_t tx = op->src.offset[0];
1667428d7b3dSmrg	int16_t ty = op->src.offset[1];
1668428d7b3dSmrg
1669428d7b3dSmrg	do {
1670428d7b3dSmrg		union {
1671428d7b3dSmrg			struct sna_coordinate p;
1672428d7b3dSmrg			float f;
1673428d7b3dSmrg		} dst;
1674428d7b3dSmrg
1675428d7b3dSmrg		dst.p.x = box->x2;
1676428d7b3dSmrg		dst.p.y = box->y2;
1677428d7b3dSmrg		v[0] = dst.f;
1678428d7b3dSmrg		v[1] = ((box->x2 + tx) * xx + x0) * sx;
1679428d7b3dSmrg		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
1680428d7b3dSmrg
1681428d7b3dSmrg		dst.p.x = box->x1;
1682428d7b3dSmrg		v[3] = dst.f;
1683428d7b3dSmrg		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
1684428d7b3dSmrg
1685428d7b3dSmrg		dst.p.y = box->y1;
1686428d7b3dSmrg		v[6] = dst.f;
1687428d7b3dSmrg		v[8] = ((box->y1 + ty) * yy + y0) * sy;
1688428d7b3dSmrg
1689428d7b3dSmrg		v += 9;
1690428d7b3dSmrg		box++;
1691428d7b3dSmrg	} while (--nbox);
1692428d7b3dSmrg}
1693428d7b3dSmrg
1694428d7b3dSmrgavx2 fastcall static void
1695428d7b3dSmrgemit_primitive_identity_mask__avx2(struct sna *sna,
1696428d7b3dSmrg				   const struct sna_composite_op *op,
1697428d7b3dSmrg				   const struct sna_composite_rectangles *r)
1698428d7b3dSmrg{
1699428d7b3dSmrg	union {
1700428d7b3dSmrg		struct sna_coordinate p;
1701428d7b3dSmrg		float f;
1702428d7b3dSmrg	} dst;
1703428d7b3dSmrg	float msk_x, msk_y;
1704428d7b3dSmrg	float w, h;
1705428d7b3dSmrg	float *v;
1706428d7b3dSmrg
1707428d7b3dSmrg	msk_x = r->mask.x + op->mask.offset[0];
1708428d7b3dSmrg	msk_y = r->mask.y + op->mask.offset[1];
1709428d7b3dSmrg	w = r->width;
1710428d7b3dSmrg	h = r->height;
1711428d7b3dSmrg
1712428d7b3dSmrg	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
1713428d7b3dSmrg	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
1714428d7b3dSmrg
1715428d7b3dSmrg	assert(op->floats_per_rect == 12);
1716428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
1717428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1718428d7b3dSmrg	sna->render.vertex_used += 12;
1719428d7b3dSmrg
1720428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1721428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1722428d7b3dSmrg	v[0] = dst.f;
1723428d7b3dSmrg	v[2] = (msk_x + w) * op->mask.scale[0];
1724428d7b3dSmrg	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
1725428d7b3dSmrg
1726428d7b3dSmrg	dst.p.x = r->dst.x;
1727428d7b3dSmrg	v[4] = dst.f;
1728428d7b3dSmrg	v[10] = v[6] = msk_x * op->mask.scale[0];
1729428d7b3dSmrg
1730428d7b3dSmrg	dst.p.y = r->dst.y;
1731428d7b3dSmrg	v[8] = dst.f;
1732428d7b3dSmrg	v[11] = msk_y * op->mask.scale[1];
1733428d7b3dSmrg
1734428d7b3dSmrg	v[9] = v[5] = v[1] = .5;
1735428d7b3dSmrg}
1736428d7b3dSmrg
1737428d7b3dSmrgavx2 fastcall static void
1738428d7b3dSmrgemit_boxes_identity_mask__avx2(const struct sna_composite_op *op,
1739428d7b3dSmrg			       const BoxRec *box, int nbox,
1740428d7b3dSmrg			       float *v)
1741428d7b3dSmrg{
1742428d7b3dSmrg	float msk_x = op->mask.offset[0];
1743428d7b3dSmrg	float msk_y = op->mask.offset[1];
1744428d7b3dSmrg
1745428d7b3dSmrg	do {
1746428d7b3dSmrg		union {
1747428d7b3dSmrg			struct sna_coordinate p;
1748428d7b3dSmrg			float f;
1749428d7b3dSmrg		} dst;
1750428d7b3dSmrg
1751428d7b3dSmrg		dst.p.x = box->x2;
1752428d7b3dSmrg		dst.p.y = box->y2;
1753428d7b3dSmrg		v[0] = dst.f;
1754428d7b3dSmrg		v[2] = (msk_x + box->x2) * op->mask.scale[0];
1755428d7b3dSmrg		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
1756428d7b3dSmrg
1757428d7b3dSmrg		dst.p.x = box->x1;
1758428d7b3dSmrg		v[4] = dst.f;
1759428d7b3dSmrg		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
1760428d7b3dSmrg
1761428d7b3dSmrg		dst.p.y = box->y1;
1762428d7b3dSmrg		v[8] = dst.f;
1763428d7b3dSmrg		v[11] = (msk_y + box->y1) * op->mask.scale[1];
1764428d7b3dSmrg
1765428d7b3dSmrg		v[9] = v[5] = v[1] = .5;
1766428d7b3dSmrg		v += 12;
1767428d7b3dSmrg		box++;
1768428d7b3dSmrg	} while (--nbox);
1769428d7b3dSmrg}
1770428d7b3dSmrg
1771428d7b3dSmrgavx2 fastcall static void
1772428d7b3dSmrgemit_primitive_linear_identity_mask__avx2(struct sna *sna,
1773428d7b3dSmrg					  const struct sna_composite_op *op,
1774428d7b3dSmrg					  const struct sna_composite_rectangles *r)
1775428d7b3dSmrg{
1776428d7b3dSmrg	union {
1777428d7b3dSmrg		struct sna_coordinate p;
1778428d7b3dSmrg		float f;
1779428d7b3dSmrg	} dst;
1780428d7b3dSmrg	float msk_x, msk_y;
1781428d7b3dSmrg	float w, h;
1782428d7b3dSmrg	float *v;
1783428d7b3dSmrg
1784428d7b3dSmrg	msk_x = r->mask.x + op->mask.offset[0];
1785428d7b3dSmrg	msk_y = r->mask.y + op->mask.offset[1];
1786428d7b3dSmrg	w = r->width;
1787428d7b3dSmrg	h = r->height;
1788428d7b3dSmrg
1789428d7b3dSmrg	DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
1790428d7b3dSmrg	     __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
1791428d7b3dSmrg
1792428d7b3dSmrg	assert(op->floats_per_rect == 12);
1793428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
1794428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
1795428d7b3dSmrg	sna->render.vertex_used += 12;
1796428d7b3dSmrg
1797428d7b3dSmrg	dst.p.x = r->dst.x + r->width;
1798428d7b3dSmrg	dst.p.y = r->dst.y + r->height;
1799428d7b3dSmrg	v[0] = dst.f;
1800428d7b3dSmrg	v[2] = (msk_x + w) * op->mask.scale[0];
1801428d7b3dSmrg	v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
1802428d7b3dSmrg
1803428d7b3dSmrg	dst.p.x = r->dst.x;
1804428d7b3dSmrg	v[4] = dst.f;
1805428d7b3dSmrg	v[10] = v[6] = msk_x * op->mask.scale[0];
1806428d7b3dSmrg
1807428d7b3dSmrg	dst.p.y = r->dst.y;
1808428d7b3dSmrg	v[8] = dst.f;
1809428d7b3dSmrg	v[11] = msk_y * op->mask.scale[1];
1810428d7b3dSmrg
1811428d7b3dSmrg	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
1812428d7b3dSmrg	v[5] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
1813428d7b3dSmrg	v[9] = compute_linear(&op->src, r->src.x, r->src.y);
1814428d7b3dSmrg}
1815428d7b3dSmrg
1816428d7b3dSmrgavx2 fastcall static void
1817428d7b3dSmrgemit_boxes_linear_identity_mask__avx2(const struct sna_composite_op *op,
1818428d7b3dSmrg				      const BoxRec *box, int nbox,
1819428d7b3dSmrg				      float *v)
1820428d7b3dSmrg{
1821428d7b3dSmrg	float msk_x = op->mask.offset[0];
1822428d7b3dSmrg	float msk_y = op->mask.offset[1];
1823428d7b3dSmrg
1824428d7b3dSmrg	do {
1825428d7b3dSmrg		union {
1826428d7b3dSmrg			struct sna_coordinate p;
1827428d7b3dSmrg			float f;
1828428d7b3dSmrg		} dst;
1829428d7b3dSmrg
1830428d7b3dSmrg		dst.p.x = box->x2;
1831428d7b3dSmrg		dst.p.y = box->y2;
1832428d7b3dSmrg		v[0] = dst.f;
1833428d7b3dSmrg		v[2] = (msk_x + box->x2) * op->mask.scale[0];
1834428d7b3dSmrg		v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
1835428d7b3dSmrg
1836428d7b3dSmrg		dst.p.x = box->x1;
1837428d7b3dSmrg		v[4] = dst.f;
1838428d7b3dSmrg		v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
1839428d7b3dSmrg
1840428d7b3dSmrg		dst.p.y = box->y1;
1841428d7b3dSmrg		v[8] = dst.f;
1842428d7b3dSmrg		v[11] = (msk_y + box->y1) * op->mask.scale[1];
1843428d7b3dSmrg
1844428d7b3dSmrg		v[1] = compute_linear(&op->src, box->x2, box->y2);
1845428d7b3dSmrg		v[5] = compute_linear(&op->src, box->x1, box->y2);
1846428d7b3dSmrg		v[9] = compute_linear(&op->src, box->x1, box->y1);
1847428d7b3dSmrg
1848428d7b3dSmrg		v += 12;
1849428d7b3dSmrg		box++;
1850428d7b3dSmrg	} while (--nbox);
1851428d7b3dSmrg}
1852428d7b3dSmrg
1853428d7b3dSmrg#endif
1854428d7b3dSmrg
1855428d7b3dSmrgunsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp)
1856428d7b3dSmrg{
1857428d7b3dSmrg	unsigned vb;
1858428d7b3dSmrg
1859428d7b3dSmrg	if (tmp->mask.bo) {
1860428d7b3dSmrg		if (tmp->mask.transform == NULL) {
1861428d7b3dSmrg			if (tmp->src.is_solid) {
1862428d7b3dSmrg				DBG(("%s: solid, identity mask\n", __FUNCTION__));
1863428d7b3dSmrg#if defined(avx2)
1864428d7b3dSmrg				if (sna->cpu_features & AVX2) {
1865428d7b3dSmrg					tmp->prim_emit = emit_primitive_identity_mask__avx2;
1866428d7b3dSmrg					tmp->emit_boxes = emit_boxes_identity_mask__avx2;
1867428d7b3dSmrg				} else
1868428d7b3dSmrg#endif
1869428d7b3dSmrg#if defined(sse4_2)
1870428d7b3dSmrg				if (sna->cpu_features & SSE4_2) {
1871428d7b3dSmrg					tmp->prim_emit = emit_primitive_identity_mask__sse4_2;
1872428d7b3dSmrg					tmp->emit_boxes = emit_boxes_identity_mask__sse4_2;
1873428d7b3dSmrg				} else
1874428d7b3dSmrg#endif
1875428d7b3dSmrg				{
1876428d7b3dSmrg					tmp->prim_emit = emit_primitive_identity_mask;
1877428d7b3dSmrg					tmp->emit_boxes = emit_boxes_identity_mask;
1878428d7b3dSmrg				}
1879428d7b3dSmrg				tmp->floats_per_vertex = 4;
1880428d7b3dSmrg				vb = 1 | 2 << 2;
1881428d7b3dSmrg			} else if (tmp->src.is_linear) {
1882428d7b3dSmrg				DBG(("%s: linear, identity mask\n", __FUNCTION__));
1883428d7b3dSmrg#if defined(avx2)
1884428d7b3dSmrg				if (sna->cpu_features & AVX2) {
1885428d7b3dSmrg					tmp->prim_emit = emit_primitive_linear_identity_mask__avx2;
1886428d7b3dSmrg					tmp->emit_boxes = emit_boxes_linear_identity_mask__avx2;
1887428d7b3dSmrg				} else
1888428d7b3dSmrg#endif
1889428d7b3dSmrg#if defined(sse4_2)
1890428d7b3dSmrg				if (sna->cpu_features & SSE4_2) {
1891428d7b3dSmrg					tmp->prim_emit = emit_primitive_linear_identity_mask__sse4_2;
1892428d7b3dSmrg					tmp->emit_boxes = emit_boxes_linear_identity_mask__sse4_2;
1893428d7b3dSmrg				} else
1894428d7b3dSmrg#endif
1895428d7b3dSmrg				{
1896428d7b3dSmrg					tmp->prim_emit = emit_primitive_linear_identity_mask;
1897428d7b3dSmrg					tmp->emit_boxes = emit_boxes_linear_identity_mask;
1898428d7b3dSmrg				}
1899428d7b3dSmrg				tmp->floats_per_vertex = 4;
1900428d7b3dSmrg				vb = 1 | 2 << 2;
1901428d7b3dSmrg			} else if (tmp->src.transform == NULL) {
1902428d7b3dSmrg				DBG(("%s: identity source, identity mask\n", __FUNCTION__));
1903428d7b3dSmrg				tmp->prim_emit = emit_primitive_identity_source_mask;
1904428d7b3dSmrg				tmp->floats_per_vertex = 5;
1905428d7b3dSmrg				vb = 2 << 2 | 2;
1906428d7b3dSmrg			} else if (tmp->src.is_affine) {
1907428d7b3dSmrg				tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
1908428d7b3dSmrg				tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
1909428d7b3dSmrg				if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
1910428d7b3dSmrg					DBG(("%s: simple src, identity mask\n", __FUNCTION__));
1911428d7b3dSmrg					tmp->prim_emit = emit_primitive_simple_source_identity;
1912428d7b3dSmrg				} else {
1913428d7b3dSmrg					DBG(("%s: affine src, identity mask\n", __FUNCTION__));
1914428d7b3dSmrg					tmp->prim_emit = emit_primitive_affine_source_identity;
1915428d7b3dSmrg				}
1916428d7b3dSmrg				tmp->floats_per_vertex = 5;
1917428d7b3dSmrg				vb = 2 << 2 | 2;
1918428d7b3dSmrg			} else {
1919428d7b3dSmrg				DBG(("%s: projective source, identity mask\n", __FUNCTION__));
1920428d7b3dSmrg				tmp->prim_emit = emit_primitive_mask;
1921428d7b3dSmrg				tmp->floats_per_vertex = 6;
1922428d7b3dSmrg				vb = 2 << 2 | 3;
1923428d7b3dSmrg			}
1924428d7b3dSmrg		} else {
1925428d7b3dSmrg			tmp->prim_emit = emit_primitive_mask;
1926428d7b3dSmrg			tmp->emit_boxes = emit_boxes_mask;
1927428d7b3dSmrg			tmp->floats_per_vertex = 1;
1928428d7b3dSmrg			vb = 0;
1929428d7b3dSmrg			if (tmp->mask.is_solid) {
1930428d7b3dSmrg				tmp->floats_per_vertex += 1;
1931428d7b3dSmrg				vb |= 1 << 2;
1932428d7b3dSmrg			} else if (tmp->mask.is_affine) {
1933428d7b3dSmrg				tmp->floats_per_vertex += 2;
1934428d7b3dSmrg				vb |= 2 << 2;
1935428d7b3dSmrg			}else {
1936428d7b3dSmrg				tmp->floats_per_vertex += 3;
1937428d7b3dSmrg				vb |= 3 << 2;
1938428d7b3dSmrg			}
1939428d7b3dSmrg			if (tmp->src.is_solid) {
1940428d7b3dSmrg				tmp->floats_per_vertex += 1;
1941428d7b3dSmrg				vb |= 1;
1942428d7b3dSmrg			} else if (tmp->src.is_affine) {
1943428d7b3dSmrg				tmp->floats_per_vertex += 2;
1944428d7b3dSmrg				vb |= 2 ;
1945428d7b3dSmrg			}else {
1946428d7b3dSmrg				tmp->floats_per_vertex += 3;
1947428d7b3dSmrg				vb |= 3;
1948428d7b3dSmrg			}
1949428d7b3dSmrg			DBG(("%s: general mask: floats-per-vertex=%d, vb=%x\n",
1950428d7b3dSmrg			     __FUNCTION__,tmp->floats_per_vertex, vb));
1951428d7b3dSmrg		}
1952428d7b3dSmrg	} else {
1953428d7b3dSmrg		if (tmp->src.is_solid) {
1954428d7b3dSmrg			DBG(("%s: solid, no mask\n", __FUNCTION__));
1955428d7b3dSmrg			tmp->prim_emit = emit_primitive_solid;
1956428d7b3dSmrg			tmp->emit_boxes = emit_boxes_solid;
1957428d7b3dSmrg			if (tmp->src.is_opaque && tmp->op == PictOpOver)
1958428d7b3dSmrg				tmp->op = PictOpSrc;
1959428d7b3dSmrg			tmp->floats_per_vertex = 2;
1960428d7b3dSmrg			vb = 1;
1961428d7b3dSmrg		} else if (tmp->src.is_linear) {
1962428d7b3dSmrg			DBG(("%s: linear, no mask\n", __FUNCTION__));
1963428d7b3dSmrg#if defined(avx2)
1964428d7b3dSmrg			if (sna->cpu_features & AVX2) {
1965428d7b3dSmrg				tmp->prim_emit = emit_primitive_linear__avx2;
1966428d7b3dSmrg				tmp->emit_boxes = emit_boxes_linear__avx2;
1967428d7b3dSmrg			} else
1968428d7b3dSmrg#endif
1969428d7b3dSmrg#if defined(sse4_2)
1970428d7b3dSmrg			if (sna->cpu_features & SSE4_2) {
1971428d7b3dSmrg				tmp->prim_emit = emit_primitive_linear__sse4_2;
1972428d7b3dSmrg				tmp->emit_boxes = emit_boxes_linear__sse4_2;
1973428d7b3dSmrg			} else
1974428d7b3dSmrg#endif
1975428d7b3dSmrg			{
1976428d7b3dSmrg				tmp->prim_emit = emit_primitive_linear;
1977428d7b3dSmrg				tmp->emit_boxes = emit_boxes_linear;
1978428d7b3dSmrg			}
1979428d7b3dSmrg			tmp->floats_per_vertex = 2;
1980428d7b3dSmrg			vb = 1;
1981428d7b3dSmrg		} else if (tmp->src.transform == NULL) {
1982428d7b3dSmrg			DBG(("%s: identity src, no mask\n", __FUNCTION__));
1983428d7b3dSmrg#if defined(avx2)
1984428d7b3dSmrg			if (sna->cpu_features & AVX2) {
1985428d7b3dSmrg				tmp->prim_emit = emit_primitive_identity_source__avx2;
1986428d7b3dSmrg				tmp->emit_boxes = emit_boxes_identity_source__avx2;
1987428d7b3dSmrg			} else
1988428d7b3dSmrg#endif
1989428d7b3dSmrg#if defined(sse4_2)
1990428d7b3dSmrg			if (sna->cpu_features & SSE4_2) {
1991428d7b3dSmrg				tmp->prim_emit = emit_primitive_identity_source__sse4_2;
1992428d7b3dSmrg				tmp->emit_boxes = emit_boxes_identity_source__sse4_2;
1993428d7b3dSmrg			} else
1994428d7b3dSmrg#endif
1995428d7b3dSmrg			{
1996428d7b3dSmrg				tmp->prim_emit = emit_primitive_identity_source;
1997428d7b3dSmrg				tmp->emit_boxes = emit_boxes_identity_source;
1998428d7b3dSmrg			}
1999428d7b3dSmrg			tmp->floats_per_vertex = 3;
2000428d7b3dSmrg			vb = 2;
2001428d7b3dSmrg		} else if (tmp->src.is_affine) {
2002428d7b3dSmrg			tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
2003428d7b3dSmrg			tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
2004428d7b3dSmrg			if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
2005428d7b3dSmrg				DBG(("%s: simple src, no mask\n", __FUNCTION__));
2006428d7b3dSmrg#if defined(avx2)
2007428d7b3dSmrg				if (sna->cpu_features & AVX2) {
2008428d7b3dSmrg					tmp->prim_emit = emit_primitive_simple_source__avx2;
2009428d7b3dSmrg					tmp->emit_boxes = emit_boxes_simple_source__avx2;
2010428d7b3dSmrg				} else
2011428d7b3dSmrg#endif
2012428d7b3dSmrg#if defined(sse4_2)
2013428d7b3dSmrg				if (sna->cpu_features & SSE4_2) {
2014428d7b3dSmrg					tmp->prim_emit = emit_primitive_simple_source__sse4_2;
2015428d7b3dSmrg					tmp->emit_boxes = emit_boxes_simple_source__sse4_2;
2016428d7b3dSmrg				} else
2017428d7b3dSmrg#endif
2018428d7b3dSmrg				{
2019428d7b3dSmrg					tmp->prim_emit = emit_primitive_simple_source;
2020428d7b3dSmrg					tmp->emit_boxes = emit_boxes_simple_source;
2021428d7b3dSmrg				}
2022428d7b3dSmrg			} else {
2023428d7b3dSmrg				DBG(("%s: affine src, no mask\n", __FUNCTION__));
2024428d7b3dSmrg				tmp->prim_emit = emit_primitive_affine_source;
2025428d7b3dSmrg				tmp->emit_boxes = emit_boxes_affine_source;
2026428d7b3dSmrg			}
2027428d7b3dSmrg			tmp->floats_per_vertex = 3;
2028428d7b3dSmrg			vb = 2;
2029428d7b3dSmrg		} else {
2030428d7b3dSmrg			DBG(("%s: projective src, no mask\n", __FUNCTION__));
2031428d7b3dSmrg			assert(!tmp->src.is_solid);
2032428d7b3dSmrg			tmp->prim_emit = emit_primitive;
2033428d7b3dSmrg			tmp->emit_boxes = emit_boxes;
2034428d7b3dSmrg			tmp->floats_per_vertex = 4;
2035428d7b3dSmrg			vb = 3;
2036428d7b3dSmrg		}
2037428d7b3dSmrg	}
2038428d7b3dSmrg	tmp->floats_per_rect = 3 * tmp->floats_per_vertex;
2039428d7b3dSmrg
2040428d7b3dSmrg	return vb;
2041428d7b3dSmrg}
2042428d7b3dSmrg
/*
 * Emit one span vertex at (x, y): the position through OUT_VERTEX,
 * followed by whatever emit_texcoord() writes for the op's base source
 * channel at the same coordinate. The caller appends the opacity.
 */
sse2 force_inline static void
emit_span_vertex(struct sna *sna,
		  const struct sna_composite_spans_op *op,
		  int16_t x, int16_t y)
{
	OUT_VERTEX(x, y);
	emit_texcoord(sna, &op->base.src, x, y);
}
2051428d7b3dSmrg
/*
 * Generic span emitter for a single box.
 *
 * Emits the corners (x2,y2), (x1,y2), (x1,y1) in that order; each vertex
 * is written by emit_span_vertex() and then followed by the same opacity
 * value through OUT_VERTEX_F.
 */
sse2 fastcall static void
emit_span_primitive(struct sna *sna,
		    const struct sna_composite_spans_op *op,
		    const BoxRec *box,
		    float opacity)
{
	emit_span_vertex(sna, op, box->x2, box->y2);
	OUT_VERTEX_F(opacity);

	emit_span_vertex(sna, op, box->x1, box->y2);
	OUT_VERTEX_F(opacity);

	emit_span_vertex(sna, op, box->x1, box->y1);
	OUT_VERTEX_F(opacity);
}
2067428d7b3dSmrg
2068428d7b3dSmrgsse2 fastcall static void
2069428d7b3dSmrgemit_span_boxes(const struct sna_composite_spans_op *op,
2070428d7b3dSmrg		const struct sna_opacity_box *b, int nbox,
2071428d7b3dSmrg		float *v)
2072428d7b3dSmrg{
2073428d7b3dSmrg	do {
2074428d7b3dSmrg		v = vemit_vertex(v, &op->base, b->box.x2, b->box.y2);
2075428d7b3dSmrg		*v++ = b->alpha;
2076428d7b3dSmrg
2077428d7b3dSmrg		v = vemit_vertex(v, &op->base, b->box.x1, b->box.y2);
2078428d7b3dSmrg		*v++ = b->alpha;
2079428d7b3dSmrg
2080428d7b3dSmrg		v = vemit_vertex(v, &op->base, b->box.x1, b->box.y1);
2081428d7b3dSmrg		*v++ = b->alpha;
2082428d7b3dSmrg
2083428d7b3dSmrg		b++;
2084428d7b3dSmrg	} while (--nbox);
2085428d7b3dSmrg}
2086428d7b3dSmrg
2087428d7b3dSmrgsse2 fastcall static void
2088428d7b3dSmrgemit_span_solid(struct sna *sna,
2089428d7b3dSmrg		const struct sna_composite_spans_op *op,
2090428d7b3dSmrg		const BoxRec *box,
2091428d7b3dSmrg		float opacity)
2092428d7b3dSmrg{
2093428d7b3dSmrg	float *v;
2094428d7b3dSmrg	union {
2095428d7b3dSmrg		struct sna_coordinate p;
2096428d7b3dSmrg		float f;
2097428d7b3dSmrg	} dst;
2098428d7b3dSmrg
2099428d7b3dSmrg	assert(op->base.floats_per_rect == 9);
2100428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
2101428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2102428d7b3dSmrg	sna->render.vertex_used += 3*3;
2103428d7b3dSmrg
2104428d7b3dSmrg	dst.p.x = box->x2;
2105428d7b3dSmrg	dst.p.y = box->y2;
2106428d7b3dSmrg	v[0] = dst.f;
2107428d7b3dSmrg
2108428d7b3dSmrg	dst.p.x = box->x1;
2109428d7b3dSmrg	v[3] = dst.f;
2110428d7b3dSmrg
2111428d7b3dSmrg	dst.p.y = box->y1;
2112428d7b3dSmrg	v[6] = dst.f;
2113428d7b3dSmrg
2114428d7b3dSmrg	v[7] = v[4] = v[1] = .5;
2115428d7b3dSmrg	v[8] = v[5] = v[2] = opacity;
2116428d7b3dSmrg}
2117428d7b3dSmrg
2118428d7b3dSmrgsse2 fastcall static void
2119428d7b3dSmrgemit_span_boxes_solid(const struct sna_composite_spans_op *op,
2120428d7b3dSmrg		      const struct sna_opacity_box *b,
2121428d7b3dSmrg		      int nbox, float *v)
2122428d7b3dSmrg{
2123428d7b3dSmrg	do {
2124428d7b3dSmrg		union {
2125428d7b3dSmrg			struct sna_coordinate p;
2126428d7b3dSmrg			float f;
2127428d7b3dSmrg		} dst;
2128428d7b3dSmrg
2129428d7b3dSmrg		dst.p.x = b->box.x2;
2130428d7b3dSmrg		dst.p.y = b->box.y2;
2131428d7b3dSmrg		v[0] = dst.f;
2132428d7b3dSmrg
2133428d7b3dSmrg		dst.p.x = b->box.x1;
2134428d7b3dSmrg		v[3] = dst.f;
2135428d7b3dSmrg
2136428d7b3dSmrg		dst.p.y = b->box.y1;
2137428d7b3dSmrg		v[6] = dst.f;
2138428d7b3dSmrg
2139428d7b3dSmrg		v[7] = v[4] = v[1] = .5;
2140428d7b3dSmrg		v[8] = v[5] = v[2] = b->alpha;
2141428d7b3dSmrg
2142428d7b3dSmrg		v += 9;
2143428d7b3dSmrg		b++;
2144428d7b3dSmrg	} while (--nbox);
2145428d7b3dSmrg}
2146428d7b3dSmrg
2147428d7b3dSmrgsse2 fastcall static void
2148428d7b3dSmrgemit_span_identity(struct sna *sna,
2149428d7b3dSmrg		    const struct sna_composite_spans_op *op,
2150428d7b3dSmrg		    const BoxRec *box,
2151428d7b3dSmrg		    float opacity)
2152428d7b3dSmrg{
2153428d7b3dSmrg	float *v;
2154428d7b3dSmrg	union {
2155428d7b3dSmrg		struct sna_coordinate p;
2156428d7b3dSmrg		float f;
2157428d7b3dSmrg	} dst;
2158428d7b3dSmrg
2159428d7b3dSmrg	float sx = op->base.src.scale[0];
2160428d7b3dSmrg	float sy = op->base.src.scale[1];
2161428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2162428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2163428d7b3dSmrg
2164428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2165428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2166428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2167428d7b3dSmrg	sna->render.vertex_used += 3*4;
2168428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
2169428d7b3dSmrg
2170428d7b3dSmrg	dst.p.x = box->x2;
2171428d7b3dSmrg	dst.p.y = box->y2;
2172428d7b3dSmrg	v[0] = dst.f;
2173428d7b3dSmrg	v[1] = (box->x2 + tx) * sx;
2174428d7b3dSmrg	v[6] = v[2] = (box->y2 + ty) * sy;
2175428d7b3dSmrg
2176428d7b3dSmrg	dst.p.x = box->x1;
2177428d7b3dSmrg	v[4] = dst.f;
2178428d7b3dSmrg	v[9] = v[5] = (box->x1 + tx) * sx;
2179428d7b3dSmrg
2180428d7b3dSmrg	dst.p.y = box->y1;
2181428d7b3dSmrg	v[8] = dst.f;
2182428d7b3dSmrg	v[10] = (box->y1 + ty) * sy;
2183428d7b3dSmrg
2184428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2185428d7b3dSmrg}
2186428d7b3dSmrg
2187428d7b3dSmrgsse2 fastcall static void
2188428d7b3dSmrgemit_span_boxes_identity(const struct sna_composite_spans_op *op,
2189428d7b3dSmrg			 const struct sna_opacity_box *b, int nbox,
2190428d7b3dSmrg			 float *v)
2191428d7b3dSmrg{
2192428d7b3dSmrg	do {
2193428d7b3dSmrg		union {
2194428d7b3dSmrg			struct sna_coordinate p;
2195428d7b3dSmrg			float f;
2196428d7b3dSmrg		} dst;
2197428d7b3dSmrg
2198428d7b3dSmrg		float sx = op->base.src.scale[0];
2199428d7b3dSmrg		float sy = op->base.src.scale[1];
2200428d7b3dSmrg		int16_t tx = op->base.src.offset[0];
2201428d7b3dSmrg		int16_t ty = op->base.src.offset[1];
2202428d7b3dSmrg
2203428d7b3dSmrg		dst.p.x = b->box.x2;
2204428d7b3dSmrg		dst.p.y = b->box.y2;
2205428d7b3dSmrg		v[0] = dst.f;
2206428d7b3dSmrg		v[1] = (b->box.x2 + tx) * sx;
2207428d7b3dSmrg		v[6] = v[2] = (b->box.y2 + ty) * sy;
2208428d7b3dSmrg
2209428d7b3dSmrg		dst.p.x = b->box.x1;
2210428d7b3dSmrg		v[4] = dst.f;
2211428d7b3dSmrg		v[9] = v[5] = (b->box.x1 + tx) * sx;
2212428d7b3dSmrg
2213428d7b3dSmrg		dst.p.y = b->box.y1;
2214428d7b3dSmrg		v[8] = dst.f;
2215428d7b3dSmrg		v[10] = (b->box.y1 + ty) * sy;
2216428d7b3dSmrg
2217428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2218428d7b3dSmrg
2219428d7b3dSmrg		v += 12;
2220428d7b3dSmrg		b++;
2221428d7b3dSmrg	} while (--nbox);
2222428d7b3dSmrg}
2223428d7b3dSmrg
2224428d7b3dSmrgsse2 fastcall static void
2225428d7b3dSmrgemit_span_simple(struct sna *sna,
2226428d7b3dSmrg		 const struct sna_composite_spans_op *op,
2227428d7b3dSmrg		 const BoxRec *box,
2228428d7b3dSmrg		 float opacity)
2229428d7b3dSmrg{
2230428d7b3dSmrg	float *v;
2231428d7b3dSmrg	union {
2232428d7b3dSmrg		struct sna_coordinate p;
2233428d7b3dSmrg		float f;
2234428d7b3dSmrg	} dst;
2235428d7b3dSmrg
2236428d7b3dSmrg	float xx = op->base.src.transform->matrix[0][0];
2237428d7b3dSmrg	float x0 = op->base.src.transform->matrix[0][2];
2238428d7b3dSmrg	float yy = op->base.src.transform->matrix[1][1];
2239428d7b3dSmrg	float y0 = op->base.src.transform->matrix[1][2];
2240428d7b3dSmrg	float sx = op->base.src.scale[0];
2241428d7b3dSmrg	float sy = op->base.src.scale[1];
2242428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2243428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2244428d7b3dSmrg
2245428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2246428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2247428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2248428d7b3dSmrg	sna->render.vertex_used += 3*4;
2249428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
2250428d7b3dSmrg
2251428d7b3dSmrg	dst.p.x = box->x2;
2252428d7b3dSmrg	dst.p.y = box->y2;
2253428d7b3dSmrg	v[0] = dst.f;
2254428d7b3dSmrg	v[1] = ((box->x2 + tx) * xx + x0) * sx;
2255428d7b3dSmrg	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
2256428d7b3dSmrg
2257428d7b3dSmrg	dst.p.x = box->x1;
2258428d7b3dSmrg	v[4] = dst.f;
2259428d7b3dSmrg	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
2260428d7b3dSmrg
2261428d7b3dSmrg	dst.p.y = box->y1;
2262428d7b3dSmrg	v[8] = dst.f;
2263428d7b3dSmrg	v[10] = ((box->y1 + ty) * yy + y0) * sy;
2264428d7b3dSmrg
2265428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2266428d7b3dSmrg}
2267428d7b3dSmrg
2268428d7b3dSmrgsse2 fastcall static void
2269428d7b3dSmrgemit_span_boxes_simple(const struct sna_composite_spans_op *op,
2270428d7b3dSmrg		       const struct sna_opacity_box *b, int nbox,
2271428d7b3dSmrg		       float *v)
2272428d7b3dSmrg{
2273428d7b3dSmrg	float xx = op->base.src.transform->matrix[0][0];
2274428d7b3dSmrg	float x0 = op->base.src.transform->matrix[0][2];
2275428d7b3dSmrg	float yy = op->base.src.transform->matrix[1][1];
2276428d7b3dSmrg	float y0 = op->base.src.transform->matrix[1][2];
2277428d7b3dSmrg	float sx = op->base.src.scale[0];
2278428d7b3dSmrg	float sy = op->base.src.scale[1];
2279428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2280428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2281428d7b3dSmrg
2282428d7b3dSmrg	do {
2283428d7b3dSmrg		union {
2284428d7b3dSmrg			struct sna_coordinate p;
2285428d7b3dSmrg			float f;
2286428d7b3dSmrg		} dst;
2287428d7b3dSmrg
2288428d7b3dSmrg		dst.p.x = b->box.x2;
2289428d7b3dSmrg		dst.p.y = b->box.y2;
2290428d7b3dSmrg		v[0] = dst.f;
2291428d7b3dSmrg		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
2292428d7b3dSmrg		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
2293428d7b3dSmrg
2294428d7b3dSmrg		dst.p.x = b->box.x1;
2295428d7b3dSmrg		v[4] = dst.f;
2296428d7b3dSmrg		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
2297428d7b3dSmrg
2298428d7b3dSmrg		dst.p.y = b->box.y1;
2299428d7b3dSmrg		v[8] = dst.f;
2300428d7b3dSmrg		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
2301428d7b3dSmrg
2302428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2303428d7b3dSmrg
2304428d7b3dSmrg		v += 12;
2305428d7b3dSmrg		b++;
2306428d7b3dSmrg	} while (--nbox);
2307428d7b3dSmrg}
2308428d7b3dSmrg
2309428d7b3dSmrgsse2 fastcall static void
2310428d7b3dSmrgemit_span_affine(struct sna *sna,
2311428d7b3dSmrg		  const struct sna_composite_spans_op *op,
2312428d7b3dSmrg		  const BoxRec *box,
2313428d7b3dSmrg		  float opacity)
2314428d7b3dSmrg{
2315428d7b3dSmrg	union {
2316428d7b3dSmrg		struct sna_coordinate p;
2317428d7b3dSmrg		float f;
2318428d7b3dSmrg	} dst;
2319428d7b3dSmrg	float *v;
2320428d7b3dSmrg
2321428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2322428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2323428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2324428d7b3dSmrg	sna->render.vertex_used += 12;
2325428d7b3dSmrg
2326428d7b3dSmrg	dst.p.x = box->x2;
2327428d7b3dSmrg	dst.p.y = box->y2;
2328428d7b3dSmrg	v[0] = dst.f;
2329428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
2330428d7b3dSmrg				    op->base.src.offset[1] + box->y2,
2331428d7b3dSmrg				    op->base.src.transform,
2332428d7b3dSmrg				    op->base.src.scale,
2333428d7b3dSmrg				    &v[1], &v[2]);
2334428d7b3dSmrg
2335428d7b3dSmrg	dst.p.x = box->x1;
2336428d7b3dSmrg	v[4] = dst.f;
2337428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2338428d7b3dSmrg				    op->base.src.offset[1] + box->y2,
2339428d7b3dSmrg				    op->base.src.transform,
2340428d7b3dSmrg				    op->base.src.scale,
2341428d7b3dSmrg				    &v[5], &v[6]);
2342428d7b3dSmrg
2343428d7b3dSmrg	dst.p.y = box->y1;
2344428d7b3dSmrg	v[8] = dst.f;
2345428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2346428d7b3dSmrg				    op->base.src.offset[1] + box->y1,
2347428d7b3dSmrg				    op->base.src.transform,
2348428d7b3dSmrg				    op->base.src.scale,
2349428d7b3dSmrg				    &v[9], &v[10]);
2350428d7b3dSmrg
2351428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2352428d7b3dSmrg}
2353428d7b3dSmrg
2354428d7b3dSmrgsse2 fastcall static void
2355428d7b3dSmrgemit_span_boxes_affine(const struct sna_composite_spans_op *op,
2356428d7b3dSmrg		       const struct sna_opacity_box *b, int nbox,
2357428d7b3dSmrg		       float *v)
2358428d7b3dSmrg{
2359428d7b3dSmrg	do {
2360428d7b3dSmrg		union {
2361428d7b3dSmrg			struct sna_coordinate p;
2362428d7b3dSmrg			float f;
2363428d7b3dSmrg		} dst;
2364428d7b3dSmrg
2365428d7b3dSmrg		dst.p.x = b->box.x2;
2366428d7b3dSmrg		dst.p.y = b->box.y2;
2367428d7b3dSmrg		v[0] = dst.f;
2368428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
2369428d7b3dSmrg					    op->base.src.offset[1] + b->box.y2,
2370428d7b3dSmrg					    op->base.src.transform,
2371428d7b3dSmrg					    op->base.src.scale,
2372428d7b3dSmrg					    &v[1], &v[2]);
2373428d7b3dSmrg
2374428d7b3dSmrg		dst.p.x = b->box.x1;
2375428d7b3dSmrg		v[4] = dst.f;
2376428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
2377428d7b3dSmrg					    op->base.src.offset[1] + b->box.y2,
2378428d7b3dSmrg					    op->base.src.transform,
2379428d7b3dSmrg					    op->base.src.scale,
2380428d7b3dSmrg					    &v[5], &v[6]);
2381428d7b3dSmrg
2382428d7b3dSmrg		dst.p.y = b->box.y1;
2383428d7b3dSmrg		v[8] = dst.f;
2384428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
2385428d7b3dSmrg					    op->base.src.offset[1] + b->box.y1,
2386428d7b3dSmrg					    op->base.src.transform,
2387428d7b3dSmrg					    op->base.src.scale,
2388428d7b3dSmrg					    &v[9], &v[10]);
2389428d7b3dSmrg
2390428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2391428d7b3dSmrg
2392428d7b3dSmrg		v += 12;
2393428d7b3dSmrg		b++;
2394428d7b3dSmrg	} while (--nbox);
2395428d7b3dSmrg}
2396428d7b3dSmrg
2397428d7b3dSmrgsse2 fastcall static void
2398428d7b3dSmrgemit_span_linear(struct sna *sna,
2399428d7b3dSmrg		 const struct sna_composite_spans_op *op,
2400428d7b3dSmrg		 const BoxRec *box,
2401428d7b3dSmrg		 float opacity)
2402428d7b3dSmrg{
2403428d7b3dSmrg	union {
2404428d7b3dSmrg		struct sna_coordinate p;
2405428d7b3dSmrg		float f;
2406428d7b3dSmrg	} dst;
2407428d7b3dSmrg	float *v;
2408428d7b3dSmrg
2409428d7b3dSmrg	assert(op->base.floats_per_rect == 9);
2410428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
2411428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2412428d7b3dSmrg	sna->render.vertex_used += 9;
2413428d7b3dSmrg
2414428d7b3dSmrg	dst.p.x = box->x2;
2415428d7b3dSmrg	dst.p.y = box->y2;
2416428d7b3dSmrg	v[0] = dst.f;
2417428d7b3dSmrg	dst.p.x = box->x1;
2418428d7b3dSmrg	v[3] = dst.f;
2419428d7b3dSmrg	dst.p.y = box->y1;
2420428d7b3dSmrg	v[6] = dst.f;
2421428d7b3dSmrg
2422428d7b3dSmrg	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
2423428d7b3dSmrg	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
2424428d7b3dSmrg	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
2425428d7b3dSmrg
2426428d7b3dSmrg	v[8] = v[5] = v[2] = opacity;
2427428d7b3dSmrg}
2428428d7b3dSmrg
2429428d7b3dSmrgsse2 fastcall static void
2430428d7b3dSmrgemit_span_boxes_linear(const struct sna_composite_spans_op *op,
2431428d7b3dSmrg		       const struct sna_opacity_box *b, int nbox,
2432428d7b3dSmrg		       float *v)
2433428d7b3dSmrg{
2434428d7b3dSmrg	do {
2435428d7b3dSmrg		union {
2436428d7b3dSmrg			struct sna_coordinate p;
2437428d7b3dSmrg			float f;
2438428d7b3dSmrg		} dst;
2439428d7b3dSmrg
2440428d7b3dSmrg		dst.p.x = b->box.x2;
2441428d7b3dSmrg		dst.p.y = b->box.y2;
2442428d7b3dSmrg		v[0] = dst.f;
2443428d7b3dSmrg		dst.p.x = b->box.x1;
2444428d7b3dSmrg		v[3] = dst.f;
2445428d7b3dSmrg		dst.p.y = b->box.y1;
2446428d7b3dSmrg		v[6] = dst.f;
2447428d7b3dSmrg
2448428d7b3dSmrg		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
2449428d7b3dSmrg		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
2450428d7b3dSmrg		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
2451428d7b3dSmrg
2452428d7b3dSmrg		v[8] = v[5] = v[2] = b->alpha;
2453428d7b3dSmrg
2454428d7b3dSmrg		v += 9;
2455428d7b3dSmrg		b++;
2456428d7b3dSmrg	} while (--nbox);
2457428d7b3dSmrg}
2458428d7b3dSmrg
2459428d7b3dSmrg/* SSE4_2 */
2460428d7b3dSmrg#if defined(sse4_2)
2461428d7b3dSmrg
2462428d7b3dSmrgsse4_2 fastcall static void
2463428d7b3dSmrgemit_span_identity__sse4_2(struct sna *sna,
2464428d7b3dSmrg			   const struct sna_composite_spans_op *op,
2465428d7b3dSmrg			   const BoxRec *box,
2466428d7b3dSmrg			   float opacity)
2467428d7b3dSmrg{
2468428d7b3dSmrg	float *v;
2469428d7b3dSmrg	union {
2470428d7b3dSmrg		struct sna_coordinate p;
2471428d7b3dSmrg		float f;
2472428d7b3dSmrg	} dst;
2473428d7b3dSmrg
2474428d7b3dSmrg	float sx = op->base.src.scale[0];
2475428d7b3dSmrg	float sy = op->base.src.scale[1];
2476428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2477428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2478428d7b3dSmrg
2479428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2480428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2481428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2482428d7b3dSmrg	sna->render.vertex_used += 3*4;
2483428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
2484428d7b3dSmrg
2485428d7b3dSmrg	dst.p.x = box->x2;
2486428d7b3dSmrg	dst.p.y = box->y2;
2487428d7b3dSmrg	v[0] = dst.f;
2488428d7b3dSmrg	v[1] = (box->x2 + tx) * sx;
2489428d7b3dSmrg	v[6] = v[2] = (box->y2 + ty) * sy;
2490428d7b3dSmrg
2491428d7b3dSmrg	dst.p.x = box->x1;
2492428d7b3dSmrg	v[4] = dst.f;
2493428d7b3dSmrg	v[9] = v[5] = (box->x1 + tx) * sx;
2494428d7b3dSmrg
2495428d7b3dSmrg	dst.p.y = box->y1;
2496428d7b3dSmrg	v[8] = dst.f;
2497428d7b3dSmrg	v[10] = (box->y1 + ty) * sy;
2498428d7b3dSmrg
2499428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2500428d7b3dSmrg}
2501428d7b3dSmrg
2502428d7b3dSmrgsse4_2 fastcall static void
2503428d7b3dSmrgemit_span_boxes_identity__sse4_2(const struct sna_composite_spans_op *op,
2504428d7b3dSmrg				 const struct sna_opacity_box *b, int nbox,
2505428d7b3dSmrg				 float *v)
2506428d7b3dSmrg{
2507428d7b3dSmrg	do {
2508428d7b3dSmrg		union {
2509428d7b3dSmrg			struct sna_coordinate p;
2510428d7b3dSmrg			float f;
2511428d7b3dSmrg		} dst;
2512428d7b3dSmrg
2513428d7b3dSmrg		float sx = op->base.src.scale[0];
2514428d7b3dSmrg		float sy = op->base.src.scale[1];
2515428d7b3dSmrg		int16_t tx = op->base.src.offset[0];
2516428d7b3dSmrg		int16_t ty = op->base.src.offset[1];
2517428d7b3dSmrg
2518428d7b3dSmrg		dst.p.x = b->box.x2;
2519428d7b3dSmrg		dst.p.y = b->box.y2;
2520428d7b3dSmrg		v[0] = dst.f;
2521428d7b3dSmrg		v[1] = (b->box.x2 + tx) * sx;
2522428d7b3dSmrg		v[6] = v[2] = (b->box.y2 + ty) * sy;
2523428d7b3dSmrg
2524428d7b3dSmrg		dst.p.x = b->box.x1;
2525428d7b3dSmrg		v[4] = dst.f;
2526428d7b3dSmrg		v[9] = v[5] = (b->box.x1 + tx) * sx;
2527428d7b3dSmrg
2528428d7b3dSmrg		dst.p.y = b->box.y1;
2529428d7b3dSmrg		v[8] = dst.f;
2530428d7b3dSmrg		v[10] = (b->box.y1 + ty) * sy;
2531428d7b3dSmrg
2532428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2533428d7b3dSmrg
2534428d7b3dSmrg		v += 12;
2535428d7b3dSmrg		b++;
2536428d7b3dSmrg	} while (--nbox);
2537428d7b3dSmrg}
2538428d7b3dSmrg
2539428d7b3dSmrgsse4_2 fastcall static void
2540428d7b3dSmrgemit_span_simple__sse4_2(struct sna *sna,
2541428d7b3dSmrg			 const struct sna_composite_spans_op *op,
2542428d7b3dSmrg			 const BoxRec *box,
2543428d7b3dSmrg			 float opacity)
2544428d7b3dSmrg{
2545428d7b3dSmrg	float *v;
2546428d7b3dSmrg	union {
2547428d7b3dSmrg		struct sna_coordinate p;
2548428d7b3dSmrg		float f;
2549428d7b3dSmrg	} dst;
2550428d7b3dSmrg
2551428d7b3dSmrg	float xx = op->base.src.transform->matrix[0][0];
2552428d7b3dSmrg	float x0 = op->base.src.transform->matrix[0][2];
2553428d7b3dSmrg	float yy = op->base.src.transform->matrix[1][1];
2554428d7b3dSmrg	float y0 = op->base.src.transform->matrix[1][2];
2555428d7b3dSmrg	float sx = op->base.src.scale[0];
2556428d7b3dSmrg	float sy = op->base.src.scale[1];
2557428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2558428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2559428d7b3dSmrg
2560428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2561428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2562428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2563428d7b3dSmrg	sna->render.vertex_used += 3*4;
2564428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
2565428d7b3dSmrg
2566428d7b3dSmrg	dst.p.x = box->x2;
2567428d7b3dSmrg	dst.p.y = box->y2;
2568428d7b3dSmrg	v[0] = dst.f;
2569428d7b3dSmrg	v[1] = ((box->x2 + tx) * xx + x0) * sx;
2570428d7b3dSmrg	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
2571428d7b3dSmrg
2572428d7b3dSmrg	dst.p.x = box->x1;
2573428d7b3dSmrg	v[4] = dst.f;
2574428d7b3dSmrg	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
2575428d7b3dSmrg
2576428d7b3dSmrg	dst.p.y = box->y1;
2577428d7b3dSmrg	v[8] = dst.f;
2578428d7b3dSmrg	v[10] = ((box->y1 + ty) * yy + y0) * sy;
2579428d7b3dSmrg
2580428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2581428d7b3dSmrg}
2582428d7b3dSmrg
2583428d7b3dSmrgsse4_2 fastcall static void
2584428d7b3dSmrgemit_span_boxes_simple__sse4_2(const struct sna_composite_spans_op *op,
2585428d7b3dSmrg			       const struct sna_opacity_box *b, int nbox,
2586428d7b3dSmrg			       float *v)
2587428d7b3dSmrg{
2588428d7b3dSmrg	float xx = op->base.src.transform->matrix[0][0];
2589428d7b3dSmrg	float x0 = op->base.src.transform->matrix[0][2];
2590428d7b3dSmrg	float yy = op->base.src.transform->matrix[1][1];
2591428d7b3dSmrg	float y0 = op->base.src.transform->matrix[1][2];
2592428d7b3dSmrg	float sx = op->base.src.scale[0];
2593428d7b3dSmrg	float sy = op->base.src.scale[1];
2594428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2595428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2596428d7b3dSmrg
2597428d7b3dSmrg	do {
2598428d7b3dSmrg		union {
2599428d7b3dSmrg			struct sna_coordinate p;
2600428d7b3dSmrg			float f;
2601428d7b3dSmrg		} dst;
2602428d7b3dSmrg
2603428d7b3dSmrg		dst.p.x = b->box.x2;
2604428d7b3dSmrg		dst.p.y = b->box.y2;
2605428d7b3dSmrg		v[0] = dst.f;
2606428d7b3dSmrg		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
2607428d7b3dSmrg		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
2608428d7b3dSmrg
2609428d7b3dSmrg		dst.p.x = b->box.x1;
2610428d7b3dSmrg		v[4] = dst.f;
2611428d7b3dSmrg		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
2612428d7b3dSmrg
2613428d7b3dSmrg		dst.p.y = b->box.y1;
2614428d7b3dSmrg		v[8] = dst.f;
2615428d7b3dSmrg		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
2616428d7b3dSmrg
2617428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2618428d7b3dSmrg
2619428d7b3dSmrg		v += 12;
2620428d7b3dSmrg		b++;
2621428d7b3dSmrg	} while (--nbox);
2622428d7b3dSmrg}
2623428d7b3dSmrg
2624428d7b3dSmrgsse4_2 fastcall static void
2625428d7b3dSmrgemit_span_affine__sse4_2(struct sna *sna,
2626428d7b3dSmrg			 const struct sna_composite_spans_op *op,
2627428d7b3dSmrg			 const BoxRec *box,
2628428d7b3dSmrg			 float opacity)
2629428d7b3dSmrg{
2630428d7b3dSmrg	union {
2631428d7b3dSmrg		struct sna_coordinate p;
2632428d7b3dSmrg		float f;
2633428d7b3dSmrg	} dst;
2634428d7b3dSmrg	float *v;
2635428d7b3dSmrg
2636428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2637428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2638428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2639428d7b3dSmrg	sna->render.vertex_used += 12;
2640428d7b3dSmrg
2641428d7b3dSmrg	dst.p.x = box->x2;
2642428d7b3dSmrg	dst.p.y = box->y2;
2643428d7b3dSmrg	v[0] = dst.f;
2644428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
2645428d7b3dSmrg				    op->base.src.offset[1] + box->y2,
2646428d7b3dSmrg				    op->base.src.transform,
2647428d7b3dSmrg				    op->base.src.scale,
2648428d7b3dSmrg				    &v[1], &v[2]);
2649428d7b3dSmrg
2650428d7b3dSmrg	dst.p.x = box->x1;
2651428d7b3dSmrg	v[4] = dst.f;
2652428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2653428d7b3dSmrg				    op->base.src.offset[1] + box->y2,
2654428d7b3dSmrg				    op->base.src.transform,
2655428d7b3dSmrg				    op->base.src.scale,
2656428d7b3dSmrg				    &v[5], &v[6]);
2657428d7b3dSmrg
2658428d7b3dSmrg	dst.p.y = box->y1;
2659428d7b3dSmrg	v[8] = dst.f;
2660428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2661428d7b3dSmrg				    op->base.src.offset[1] + box->y1,
2662428d7b3dSmrg				    op->base.src.transform,
2663428d7b3dSmrg				    op->base.src.scale,
2664428d7b3dSmrg				    &v[9], &v[10]);
2665428d7b3dSmrg
2666428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2667428d7b3dSmrg}
2668428d7b3dSmrg
2669428d7b3dSmrgsse4_2 fastcall static void
2670428d7b3dSmrgemit_span_boxes_affine__sse4_2(const struct sna_composite_spans_op *op,
2671428d7b3dSmrg			       const struct sna_opacity_box *b, int nbox,
2672428d7b3dSmrg			       float *v)
2673428d7b3dSmrg{
2674428d7b3dSmrg	do {
2675428d7b3dSmrg		union {
2676428d7b3dSmrg			struct sna_coordinate p;
2677428d7b3dSmrg			float f;
2678428d7b3dSmrg		} dst;
2679428d7b3dSmrg
2680428d7b3dSmrg		dst.p.x = b->box.x2;
2681428d7b3dSmrg		dst.p.y = b->box.y2;
2682428d7b3dSmrg		v[0] = dst.f;
2683428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
2684428d7b3dSmrg					    op->base.src.offset[1] + b->box.y2,
2685428d7b3dSmrg					    op->base.src.transform,
2686428d7b3dSmrg					    op->base.src.scale,
2687428d7b3dSmrg					    &v[1], &v[2]);
2688428d7b3dSmrg
2689428d7b3dSmrg		dst.p.x = b->box.x1;
2690428d7b3dSmrg		v[4] = dst.f;
2691428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
2692428d7b3dSmrg					    op->base.src.offset[1] + b->box.y2,
2693428d7b3dSmrg					    op->base.src.transform,
2694428d7b3dSmrg					    op->base.src.scale,
2695428d7b3dSmrg					    &v[5], &v[6]);
2696428d7b3dSmrg
2697428d7b3dSmrg		dst.p.y = b->box.y1;
2698428d7b3dSmrg		v[8] = dst.f;
2699428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
2700428d7b3dSmrg					    op->base.src.offset[1] + b->box.y1,
2701428d7b3dSmrg					    op->base.src.transform,
2702428d7b3dSmrg					    op->base.src.scale,
2703428d7b3dSmrg					    &v[9], &v[10]);
2704428d7b3dSmrg
2705428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2706428d7b3dSmrg
2707428d7b3dSmrg		v += 12;
2708428d7b3dSmrg		b++;
2709428d7b3dSmrg	} while (--nbox);
2710428d7b3dSmrg}
2711428d7b3dSmrg
2712428d7b3dSmrgsse4_2 fastcall static void
2713428d7b3dSmrgemit_span_linear__sse4_2(struct sna *sna,
2714428d7b3dSmrg			 const struct sna_composite_spans_op *op,
2715428d7b3dSmrg			 const BoxRec *box,
2716428d7b3dSmrg			 float opacity)
2717428d7b3dSmrg{
2718428d7b3dSmrg	union {
2719428d7b3dSmrg		struct sna_coordinate p;
2720428d7b3dSmrg		float f;
2721428d7b3dSmrg	} dst;
2722428d7b3dSmrg	float *v;
2723428d7b3dSmrg
2724428d7b3dSmrg	assert(op->base.floats_per_rect == 9);
2725428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
2726428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2727428d7b3dSmrg	sna->render.vertex_used += 9;
2728428d7b3dSmrg
2729428d7b3dSmrg	dst.p.x = box->x2;
2730428d7b3dSmrg	dst.p.y = box->y2;
2731428d7b3dSmrg	v[0] = dst.f;
2732428d7b3dSmrg	dst.p.x = box->x1;
2733428d7b3dSmrg	v[3] = dst.f;
2734428d7b3dSmrg	dst.p.y = box->y1;
2735428d7b3dSmrg	v[6] = dst.f;
2736428d7b3dSmrg
2737428d7b3dSmrg	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
2738428d7b3dSmrg	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
2739428d7b3dSmrg	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
2740428d7b3dSmrg
2741428d7b3dSmrg	v[8] = v[5] = v[2] = opacity;
2742428d7b3dSmrg}
2743428d7b3dSmrg
2744428d7b3dSmrgsse4_2 fastcall static void
2745428d7b3dSmrgemit_span_boxes_linear__sse4_2(const struct sna_composite_spans_op *op,
2746428d7b3dSmrg			       const struct sna_opacity_box *b, int nbox,
2747428d7b3dSmrg			       float *v)
2748428d7b3dSmrg{
2749428d7b3dSmrg	do {
2750428d7b3dSmrg		union {
2751428d7b3dSmrg			struct sna_coordinate p;
2752428d7b3dSmrg			float f;
2753428d7b3dSmrg		} dst;
2754428d7b3dSmrg
2755428d7b3dSmrg		dst.p.x = b->box.x2;
2756428d7b3dSmrg		dst.p.y = b->box.y2;
2757428d7b3dSmrg		v[0] = dst.f;
2758428d7b3dSmrg		dst.p.x = b->box.x1;
2759428d7b3dSmrg		v[3] = dst.f;
2760428d7b3dSmrg		dst.p.y = b->box.y1;
2761428d7b3dSmrg		v[6] = dst.f;
2762428d7b3dSmrg
2763428d7b3dSmrg		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
2764428d7b3dSmrg		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
2765428d7b3dSmrg		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
2766428d7b3dSmrg
2767428d7b3dSmrg		v[8] = v[5] = v[2] = b->alpha;
2768428d7b3dSmrg
2769428d7b3dSmrg		v += 9;
2770428d7b3dSmrg		b++;
2771428d7b3dSmrg	} while (--nbox);
2772428d7b3dSmrg}
2773428d7b3dSmrg
2774428d7b3dSmrg#endif
2775428d7b3dSmrg
2776428d7b3dSmrg/* AVX2 */
2777428d7b3dSmrg#if defined(avx2)
2778428d7b3dSmrg
2779428d7b3dSmrgavx2 fastcall static void
2780428d7b3dSmrgemit_span_identity__avx2(struct sna *sna,
2781428d7b3dSmrg			 const struct sna_composite_spans_op *op,
2782428d7b3dSmrg			 const BoxRec *box,
2783428d7b3dSmrg			 float opacity)
2784428d7b3dSmrg{
2785428d7b3dSmrg	float *v;
2786428d7b3dSmrg	union {
2787428d7b3dSmrg		struct sna_coordinate p;
2788428d7b3dSmrg		float f;
2789428d7b3dSmrg	} dst;
2790428d7b3dSmrg
2791428d7b3dSmrg	float sx = op->base.src.scale[0];
2792428d7b3dSmrg	float sy = op->base.src.scale[1];
2793428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2794428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2795428d7b3dSmrg
2796428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2797428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2798428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2799428d7b3dSmrg	sna->render.vertex_used += 3*4;
2800428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
2801428d7b3dSmrg
2802428d7b3dSmrg	dst.p.x = box->x2;
2803428d7b3dSmrg	dst.p.y = box->y2;
2804428d7b3dSmrg	v[0] = dst.f;
2805428d7b3dSmrg	v[1] = (box->x2 + tx) * sx;
2806428d7b3dSmrg	v[6] = v[2] = (box->y2 + ty) * sy;
2807428d7b3dSmrg
2808428d7b3dSmrg	dst.p.x = box->x1;
2809428d7b3dSmrg	v[4] = dst.f;
2810428d7b3dSmrg	v[9] = v[5] = (box->x1 + tx) * sx;
2811428d7b3dSmrg
2812428d7b3dSmrg	dst.p.y = box->y1;
2813428d7b3dSmrg	v[8] = dst.f;
2814428d7b3dSmrg	v[10] = (box->y1 + ty) * sy;
2815428d7b3dSmrg
2816428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2817428d7b3dSmrg}
2818428d7b3dSmrg
2819428d7b3dSmrgavx2 fastcall static void
2820428d7b3dSmrgemit_span_boxes_identity__avx2(const struct sna_composite_spans_op *op,
2821428d7b3dSmrg			       const struct sna_opacity_box *b, int nbox,
2822428d7b3dSmrg			       float *v)
2823428d7b3dSmrg{
2824428d7b3dSmrg	do {
2825428d7b3dSmrg		union {
2826428d7b3dSmrg			struct sna_coordinate p;
2827428d7b3dSmrg			float f;
2828428d7b3dSmrg		} dst;
2829428d7b3dSmrg
2830428d7b3dSmrg		float sx = op->base.src.scale[0];
2831428d7b3dSmrg		float sy = op->base.src.scale[1];
2832428d7b3dSmrg		int16_t tx = op->base.src.offset[0];
2833428d7b3dSmrg		int16_t ty = op->base.src.offset[1];
2834428d7b3dSmrg
2835428d7b3dSmrg		dst.p.x = b->box.x2;
2836428d7b3dSmrg		dst.p.y = b->box.y2;
2837428d7b3dSmrg		v[0] = dst.f;
2838428d7b3dSmrg		v[1] = (b->box.x2 + tx) * sx;
2839428d7b3dSmrg		v[6] = v[2] = (b->box.y2 + ty) * sy;
2840428d7b3dSmrg
2841428d7b3dSmrg		dst.p.x = b->box.x1;
2842428d7b3dSmrg		v[4] = dst.f;
2843428d7b3dSmrg		v[9] = v[5] = (b->box.x1 + tx) * sx;
2844428d7b3dSmrg
2845428d7b3dSmrg		dst.p.y = b->box.y1;
2846428d7b3dSmrg		v[8] = dst.f;
2847428d7b3dSmrg		v[10] = (b->box.y1 + ty) * sy;
2848428d7b3dSmrg
2849428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2850428d7b3dSmrg
2851428d7b3dSmrg		v += 12;
2852428d7b3dSmrg		b++;
2853428d7b3dSmrg	} while (--nbox);
2854428d7b3dSmrg}
2855428d7b3dSmrg
2856428d7b3dSmrgavx2 fastcall static void
2857428d7b3dSmrgemit_span_simple__avx2(struct sna *sna,
2858428d7b3dSmrg		       const struct sna_composite_spans_op *op,
2859428d7b3dSmrg		       const BoxRec *box,
2860428d7b3dSmrg		       float opacity)
2861428d7b3dSmrg{
2862428d7b3dSmrg	float *v;
2863428d7b3dSmrg	union {
2864428d7b3dSmrg		struct sna_coordinate p;
2865428d7b3dSmrg		float f;
2866428d7b3dSmrg	} dst;
2867428d7b3dSmrg
2868428d7b3dSmrg	float xx = op->base.src.transform->matrix[0][0];
2869428d7b3dSmrg	float x0 = op->base.src.transform->matrix[0][2];
2870428d7b3dSmrg	float yy = op->base.src.transform->matrix[1][1];
2871428d7b3dSmrg	float y0 = op->base.src.transform->matrix[1][2];
2872428d7b3dSmrg	float sx = op->base.src.scale[0];
2873428d7b3dSmrg	float sy = op->base.src.scale[1];
2874428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2875428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2876428d7b3dSmrg
2877428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2878428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2879428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2880428d7b3dSmrg	sna->render.vertex_used += 3*4;
2881428d7b3dSmrg	assert(sna->render.vertex_used <= sna->render.vertex_size);
2882428d7b3dSmrg
2883428d7b3dSmrg	dst.p.x = box->x2;
2884428d7b3dSmrg	dst.p.y = box->y2;
2885428d7b3dSmrg	v[0] = dst.f;
2886428d7b3dSmrg	v[1] = ((box->x2 + tx) * xx + x0) * sx;
2887428d7b3dSmrg	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
2888428d7b3dSmrg
2889428d7b3dSmrg	dst.p.x = box->x1;
2890428d7b3dSmrg	v[4] = dst.f;
2891428d7b3dSmrg	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
2892428d7b3dSmrg
2893428d7b3dSmrg	dst.p.y = box->y1;
2894428d7b3dSmrg	v[8] = dst.f;
2895428d7b3dSmrg	v[10] = ((box->y1 + ty) * yy + y0) * sy;
2896428d7b3dSmrg
2897428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2898428d7b3dSmrg}
2899428d7b3dSmrg
2900428d7b3dSmrgavx2 fastcall static void
2901428d7b3dSmrgemit_span_boxes_simple__avx2(const struct sna_composite_spans_op *op,
2902428d7b3dSmrg			     const struct sna_opacity_box *b, int nbox,
2903428d7b3dSmrg			     float *v)
2904428d7b3dSmrg{
2905428d7b3dSmrg	float xx = op->base.src.transform->matrix[0][0];
2906428d7b3dSmrg	float x0 = op->base.src.transform->matrix[0][2];
2907428d7b3dSmrg	float yy = op->base.src.transform->matrix[1][1];
2908428d7b3dSmrg	float y0 = op->base.src.transform->matrix[1][2];
2909428d7b3dSmrg	float sx = op->base.src.scale[0];
2910428d7b3dSmrg	float sy = op->base.src.scale[1];
2911428d7b3dSmrg	int16_t tx = op->base.src.offset[0];
2912428d7b3dSmrg	int16_t ty = op->base.src.offset[1];
2913428d7b3dSmrg
2914428d7b3dSmrg	do {
2915428d7b3dSmrg		union {
2916428d7b3dSmrg			struct sna_coordinate p;
2917428d7b3dSmrg			float f;
2918428d7b3dSmrg		} dst;
2919428d7b3dSmrg
2920428d7b3dSmrg		dst.p.x = b->box.x2;
2921428d7b3dSmrg		dst.p.y = b->box.y2;
2922428d7b3dSmrg		v[0] = dst.f;
2923428d7b3dSmrg		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
2924428d7b3dSmrg		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
2925428d7b3dSmrg
2926428d7b3dSmrg		dst.p.x = b->box.x1;
2927428d7b3dSmrg		v[4] = dst.f;
2928428d7b3dSmrg		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
2929428d7b3dSmrg
2930428d7b3dSmrg		dst.p.y = b->box.y1;
2931428d7b3dSmrg		v[8] = dst.f;
2932428d7b3dSmrg		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
2933428d7b3dSmrg
2934428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
2935428d7b3dSmrg
2936428d7b3dSmrg		v += 12;
2937428d7b3dSmrg		b++;
2938428d7b3dSmrg	} while (--nbox);
2939428d7b3dSmrg}
2940428d7b3dSmrg
2941428d7b3dSmrgavx2 fastcall static void
2942428d7b3dSmrgemit_span_affine__avx2(struct sna *sna,
2943428d7b3dSmrg		       const struct sna_composite_spans_op *op,
2944428d7b3dSmrg		       const BoxRec *box,
2945428d7b3dSmrg		       float opacity)
2946428d7b3dSmrg{
2947428d7b3dSmrg	union {
2948428d7b3dSmrg		struct sna_coordinate p;
2949428d7b3dSmrg		float f;
2950428d7b3dSmrg	} dst;
2951428d7b3dSmrg	float *v;
2952428d7b3dSmrg
2953428d7b3dSmrg	assert(op->base.floats_per_rect == 12);
2954428d7b3dSmrg	assert((sna->render.vertex_used % 4) == 0);
2955428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
2956428d7b3dSmrg	sna->render.vertex_used += 12;
2957428d7b3dSmrg
2958428d7b3dSmrg	dst.p.x = box->x2;
2959428d7b3dSmrg	dst.p.y = box->y2;
2960428d7b3dSmrg	v[0] = dst.f;
2961428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
2962428d7b3dSmrg				    op->base.src.offset[1] + box->y2,
2963428d7b3dSmrg				    op->base.src.transform,
2964428d7b3dSmrg				    op->base.src.scale,
2965428d7b3dSmrg				    &v[1], &v[2]);
2966428d7b3dSmrg
2967428d7b3dSmrg	dst.p.x = box->x1;
2968428d7b3dSmrg	v[4] = dst.f;
2969428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2970428d7b3dSmrg				    op->base.src.offset[1] + box->y2,
2971428d7b3dSmrg				    op->base.src.transform,
2972428d7b3dSmrg				    op->base.src.scale,
2973428d7b3dSmrg				    &v[5], &v[6]);
2974428d7b3dSmrg
2975428d7b3dSmrg	dst.p.y = box->y1;
2976428d7b3dSmrg	v[8] = dst.f;
2977428d7b3dSmrg	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
2978428d7b3dSmrg				    op->base.src.offset[1] + box->y1,
2979428d7b3dSmrg				    op->base.src.transform,
2980428d7b3dSmrg				    op->base.src.scale,
2981428d7b3dSmrg				    &v[9], &v[10]);
2982428d7b3dSmrg
2983428d7b3dSmrg	v[11] = v[7] = v[3] = opacity;
2984428d7b3dSmrg}
2985428d7b3dSmrg
2986428d7b3dSmrgavx2 fastcall static void
2987428d7b3dSmrgemit_span_boxes_affine__avx2(const struct sna_composite_spans_op *op,
2988428d7b3dSmrg			     const struct sna_opacity_box *b, int nbox,
2989428d7b3dSmrg			     float *v)
2990428d7b3dSmrg{
2991428d7b3dSmrg	do {
2992428d7b3dSmrg		union {
2993428d7b3dSmrg			struct sna_coordinate p;
2994428d7b3dSmrg			float f;
2995428d7b3dSmrg		} dst;
2996428d7b3dSmrg
2997428d7b3dSmrg		dst.p.x = b->box.x2;
2998428d7b3dSmrg		dst.p.y = b->box.y2;
2999428d7b3dSmrg		v[0] = dst.f;
3000428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
3001428d7b3dSmrg					    op->base.src.offset[1] + b->box.y2,
3002428d7b3dSmrg					    op->base.src.transform,
3003428d7b3dSmrg					    op->base.src.scale,
3004428d7b3dSmrg					    &v[1], &v[2]);
3005428d7b3dSmrg
3006428d7b3dSmrg		dst.p.x = b->box.x1;
3007428d7b3dSmrg		v[4] = dst.f;
3008428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
3009428d7b3dSmrg					    op->base.src.offset[1] + b->box.y2,
3010428d7b3dSmrg					    op->base.src.transform,
3011428d7b3dSmrg					    op->base.src.scale,
3012428d7b3dSmrg					    &v[5], &v[6]);
3013428d7b3dSmrg
3014428d7b3dSmrg		dst.p.y = b->box.y1;
3015428d7b3dSmrg		v[8] = dst.f;
3016428d7b3dSmrg		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
3017428d7b3dSmrg					    op->base.src.offset[1] + b->box.y1,
3018428d7b3dSmrg					    op->base.src.transform,
3019428d7b3dSmrg					    op->base.src.scale,
3020428d7b3dSmrg					    &v[9], &v[10]);
3021428d7b3dSmrg
3022428d7b3dSmrg		v[11] = v[7] = v[3] = b->alpha;
3023428d7b3dSmrg
3024428d7b3dSmrg		v += 12;
3025428d7b3dSmrg		b++;
3026428d7b3dSmrg	} while (--nbox);
3027428d7b3dSmrg}
3028428d7b3dSmrg
3029428d7b3dSmrgavx2 fastcall static void
3030428d7b3dSmrgemit_span_linear__avx2(struct sna *sna,
3031428d7b3dSmrg		       const struct sna_composite_spans_op *op,
3032428d7b3dSmrg		       const BoxRec *box,
3033428d7b3dSmrg		       float opacity)
3034428d7b3dSmrg{
3035428d7b3dSmrg	union {
3036428d7b3dSmrg		struct sna_coordinate p;
3037428d7b3dSmrg		float f;
3038428d7b3dSmrg	} dst;
3039428d7b3dSmrg	float *v;
3040428d7b3dSmrg
3041428d7b3dSmrg	assert(op->base.floats_per_rect == 9);
3042428d7b3dSmrg	assert((sna->render.vertex_used % 3) == 0);
3043428d7b3dSmrg	v = sna->render.vertices + sna->render.vertex_used;
3044428d7b3dSmrg	sna->render.vertex_used += 9;
3045428d7b3dSmrg
3046428d7b3dSmrg	dst.p.x = box->x2;
3047428d7b3dSmrg	dst.p.y = box->y2;
3048428d7b3dSmrg	v[0] = dst.f;
3049428d7b3dSmrg	dst.p.x = box->x1;
3050428d7b3dSmrg	v[3] = dst.f;
3051428d7b3dSmrg	dst.p.y = box->y1;
3052428d7b3dSmrg	v[6] = dst.f;
3053428d7b3dSmrg
3054428d7b3dSmrg	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
3055428d7b3dSmrg	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
3056428d7b3dSmrg	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
3057428d7b3dSmrg
3058428d7b3dSmrg	v[8] = v[5] = v[2] = opacity;
3059428d7b3dSmrg}
3060428d7b3dSmrg
3061428d7b3dSmrgavx2 fastcall static void
3062428d7b3dSmrgemit_span_boxes_linear__avx2(const struct sna_composite_spans_op *op,
3063428d7b3dSmrg			     const struct sna_opacity_box *b, int nbox,
3064428d7b3dSmrg			     float *v)
3065428d7b3dSmrg{
3066428d7b3dSmrg	do {
3067428d7b3dSmrg		union {
3068428d7b3dSmrg			struct sna_coordinate p;
3069428d7b3dSmrg			float f;
3070428d7b3dSmrg		} dst;
3071428d7b3dSmrg
3072428d7b3dSmrg		dst.p.x = b->box.x2;
3073428d7b3dSmrg		dst.p.y = b->box.y2;
3074428d7b3dSmrg		v[0] = dst.f;
3075428d7b3dSmrg		dst.p.x = b->box.x1;
3076428d7b3dSmrg		v[3] = dst.f;
3077428d7b3dSmrg		dst.p.y = b->box.y1;
3078428d7b3dSmrg		v[6] = dst.f;
3079428d7b3dSmrg
3080428d7b3dSmrg		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
3081428d7b3dSmrg		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
3082428d7b3dSmrg		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
3083428d7b3dSmrg
3084428d7b3dSmrg		v[8] = v[5] = v[2] = b->alpha;
3085428d7b3dSmrg
3086428d7b3dSmrg		v += 9;
3087428d7b3dSmrg		b++;
3088428d7b3dSmrg	} while (--nbox);
3089428d7b3dSmrg}
3090428d7b3dSmrg#endif
3091428d7b3dSmrg
3092428d7b3dSmrgunsigned gen4_choose_spans_emitter(struct sna *sna,
3093428d7b3dSmrg				   struct sna_composite_spans_op *tmp)
3094428d7b3dSmrg{
3095428d7b3dSmrg	unsigned vb;
3096428d7b3dSmrg
3097428d7b3dSmrg	if (tmp->base.src.is_solid) {
3098428d7b3dSmrg		DBG(("%s: solid source\n", __FUNCTION__));
3099428d7b3dSmrg		tmp->prim_emit = emit_span_solid;
3100428d7b3dSmrg		tmp->emit_boxes = emit_span_boxes_solid;
3101428d7b3dSmrg		tmp->base.floats_per_vertex = 3;
3102428d7b3dSmrg		vb = 1 << 2 | 1;
3103428d7b3dSmrg	} else if (tmp->base.src.is_linear) {
3104428d7b3dSmrg		DBG(("%s: linear source\n", __FUNCTION__));
3105428d7b3dSmrg#if defined(avx2)
3106428d7b3dSmrg		if (sna->cpu_features & AVX2) {
3107428d7b3dSmrg			tmp->prim_emit = emit_span_linear__avx2;
3108428d7b3dSmrg			tmp->emit_boxes = emit_span_boxes_linear__avx2;
3109428d7b3dSmrg		} else
3110428d7b3dSmrg#endif
3111428d7b3dSmrg#if defined(sse4_2)
3112428d7b3dSmrg		if (sna->cpu_features & SSE4_2) {
3113428d7b3dSmrg			tmp->prim_emit = emit_span_linear__sse4_2;
3114428d7b3dSmrg			tmp->emit_boxes = emit_span_boxes_linear__sse4_2;
3115428d7b3dSmrg		} else
3116428d7b3dSmrg#endif
3117428d7b3dSmrg		{
3118428d7b3dSmrg			tmp->prim_emit = emit_span_linear;
3119428d7b3dSmrg			tmp->emit_boxes = emit_span_boxes_linear;
3120428d7b3dSmrg		}
3121428d7b3dSmrg		tmp->base.floats_per_vertex = 3;
3122428d7b3dSmrg		vb = 1 << 2 | 1;
3123428d7b3dSmrg	} else if (tmp->base.src.transform == NULL) {
3124428d7b3dSmrg		DBG(("%s: identity transform\n", __FUNCTION__));
3125428d7b3dSmrg#if defined(avx2)
3126428d7b3dSmrg		if (sna->cpu_features & AVX2) {
3127428d7b3dSmrg			tmp->prim_emit = emit_span_identity__avx2;
3128428d7b3dSmrg			tmp->emit_boxes = emit_span_boxes_identity__avx2;
3129428d7b3dSmrg		} else
3130428d7b3dSmrg#endif
3131428d7b3dSmrg#if defined(sse4_2)
3132428d7b3dSmrg		if (sna->cpu_features & SSE4_2) {
3133428d7b3dSmrg			tmp->prim_emit = emit_span_identity__sse4_2;
3134428d7b3dSmrg			tmp->emit_boxes = emit_span_boxes_identity__sse4_2;
3135428d7b3dSmrg		} else
3136428d7b3dSmrg#endif
3137428d7b3dSmrg		{
3138428d7b3dSmrg			tmp->prim_emit = emit_span_identity;
3139428d7b3dSmrg			tmp->emit_boxes = emit_span_boxes_identity;
3140428d7b3dSmrg		}
3141428d7b3dSmrg		tmp->base.floats_per_vertex = 4;
3142428d7b3dSmrg		vb = 1 << 2 | 2;
3143428d7b3dSmrg	} else if (tmp->base.is_affine) {
3144428d7b3dSmrg		tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
3145428d7b3dSmrg		tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
3146428d7b3dSmrg		if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) {
3147428d7b3dSmrg			DBG(("%s: simple (unrotated affine) transform\n", __FUNCTION__));
3148428d7b3dSmrg#if defined(avx2)
3149428d7b3dSmrg			if (sna->cpu_features & AVX2) {
3150428d7b3dSmrg				tmp->prim_emit = emit_span_simple__avx2;
3151428d7b3dSmrg				tmp->emit_boxes = emit_span_boxes_simple__avx2;
3152428d7b3dSmrg			} else
3153428d7b3dSmrg#endif
3154428d7b3dSmrg#if defined(sse4_2)
3155428d7b3dSmrg			if (sna->cpu_features & SSE4_2) {
3156428d7b3dSmrg				tmp->prim_emit = emit_span_simple__sse4_2;
3157428d7b3dSmrg				tmp->emit_boxes = emit_span_boxes_simple__sse4_2;
3158428d7b3dSmrg			} else
3159428d7b3dSmrg#endif
3160428d7b3dSmrg			{
3161428d7b3dSmrg				tmp->prim_emit = emit_span_simple;
3162428d7b3dSmrg				tmp->emit_boxes = emit_span_boxes_simple;
3163428d7b3dSmrg			}
3164428d7b3dSmrg		} else {
3165428d7b3dSmrg			DBG(("%s: affine transform\n", __FUNCTION__));
3166428d7b3dSmrg#if defined(avx2)
3167428d7b3dSmrg			if (sna->cpu_features & AVX2) {
3168428d7b3dSmrg				tmp->prim_emit = emit_span_affine__avx2;
3169428d7b3dSmrg				tmp->emit_boxes = emit_span_boxes_affine__avx2;
3170428d7b3dSmrg			} else
3171428d7b3dSmrg#endif
3172428d7b3dSmrg#if defined(sse4_2)
3173428d7b3dSmrg			if (sna->cpu_features & SSE4_2) {
3174428d7b3dSmrg				tmp->prim_emit = emit_span_affine__sse4_2;
3175428d7b3dSmrg				tmp->emit_boxes = emit_span_boxes_affine__sse4_2;
3176428d7b3dSmrg			} else
3177428d7b3dSmrg#endif
3178428d7b3dSmrg			{
3179428d7b3dSmrg				tmp->prim_emit = emit_span_affine;
3180428d7b3dSmrg				tmp->emit_boxes = emit_span_boxes_affine;
3181428d7b3dSmrg			}
3182428d7b3dSmrg		}
3183428d7b3dSmrg		tmp->base.floats_per_vertex = 4;
3184428d7b3dSmrg		vb = 1 << 2 | 2;
3185428d7b3dSmrg	} else {
3186428d7b3dSmrg		DBG(("%s: projective transform\n", __FUNCTION__));
3187428d7b3dSmrg		tmp->prim_emit = emit_span_primitive;
3188428d7b3dSmrg		tmp->emit_boxes = emit_span_boxes;
3189428d7b3dSmrg		tmp->base.floats_per_vertex = 5;
3190428d7b3dSmrg		vb = 1 << 2 | 3;
3191428d7b3dSmrg	}
3192428d7b3dSmrg	tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
3193428d7b3dSmrg	return vb;
3194428d7b3dSmrg}
3195