1/*
2 * Copyright © 2006,2008,2011 Intel Corporation
3 * Copyright © 2007 Red Hat, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors:
25 *    Wang Zhenyu <zhenyu.z.wang@sna.com>
26 *    Eric Anholt <eric@anholt.net>
27 *    Carl Worth <cworth@redhat.com>
28 *    Keith Packard <keithp@keithp.com>
29 *    Chris Wilson <chris@chris-wilson.co.uk>
30 *
31 */
32
33#ifdef HAVE_CONFIG_H
34#include "config.h"
35#endif
36
37#include "sna.h"
38#include "sna_reg.h"
39#include "sna_render.h"
40#include "sna_render_inline.h"
41#include "sna_video.h"
42
43#include "brw/brw.h"
44#include "gen7_render.h"
45#include "gen4_common.h"
46#include "gen4_source.h"
47#include "gen4_vertex.h"
48#include "gen6_common.h"
49
50#define ALWAYS_INVALIDATE 0
51#define ALWAYS_FLUSH 0
52#define ALWAYS_STALL 0
53
54#define NO_COMPOSITE 0
55#define NO_COMPOSITE_SPANS 0
56#define NO_COPY 0
57#define NO_COPY_BOXES 0
58#define NO_FILL 0
59#define NO_FILL_BOXES 0
60#define NO_FILL_ONE 0
61#define NO_FILL_CLEAR 0
62
63#define USE_8_PIXEL_DISPATCH 1
64#define USE_16_PIXEL_DISPATCH 1
65#define USE_32_PIXEL_DISPATCH 0
66
67#if !USE_8_PIXEL_DISPATCH && !USE_16_PIXEL_DISPATCH && !USE_32_PIXEL_DISPATCH
68#error "Must select at least 8, 16 or 32 pixel dispatch"
69#endif
70
71#define GEN7_MAX_SIZE 16384
72
73/* XXX Todo
74 *
75 * STR (software tiled rendering) mode. No, really.
76 * 64x32 pixel blocks align with the rendering cache. Worth considering.
77 */
78
79#define is_aligned(x, y) (((x) & ((y) - 1)) == 0)
80
/* Per-GT hardware configuration selected once at init time (see the
 * ivb_/byt_/hsw_ tables below) and consulted when programming the 3D
 * pipeline state. */
struct gt_info {
	const char *name;	/* human-readable chipset name, for logging */
	uint32_t max_vs_threads;
	uint32_t max_gs_threads;
	uint32_t max_wm_threads;	/* pre-shifted field ORed into 3DSTATE_PS */
	struct {
		int size;
		int max_vs_entries;
		int max_gs_entries;
		int push_ps_size; /* in 1KBs */
	} urb;
	int gt;		/* GT level (0 = unknown/minimal safe config) */
	uint32_t mocs;	/* memory-object control state for base addresses */
};
95
/* Fallback Ivybridge config for an unrecognised GT level: minimal,
 * conservative thread/URB limits.
 * NOTE(review): .mocs is implicitly 0 here (uncached) unlike the gt1/gt2
 * entries — presumably deliberate for the unknown-GT case; confirm. */
static const struct gt_info ivb_gt_info = {
	.name = "Ivybridge (gen7)",
	.max_vs_threads = 16,
	.max_gs_threads = 16,
	.max_wm_threads = (16-1) << IVB_PS_MAX_THREADS_SHIFT,
	.urb = { 128, 64, 64, 8 },
	.gt = 0,
};

static const struct gt_info ivb_gt1_info = {
	.name = "Ivybridge (gen7, gt1)",
	.max_vs_threads = 36,
	.max_gs_threads = 36,
	.max_wm_threads = (48-1) << IVB_PS_MAX_THREADS_SHIFT,
	.urb = { 128, 512, 192, 8 },
	.gt = 1,
	.mocs = 3,
};

static const struct gt_info ivb_gt2_info = {
	.name = "Ivybridge (gen7, gt2)",
	.max_vs_threads = 128,
	.max_gs_threads = 128,
	.max_wm_threads = (172-1) << IVB_PS_MAX_THREADS_SHIFT,
	.urb = { 256, 704, 320, 8 },
	.gt = 2,
	.mocs = 3,
};

/* Baytrail (Valleyview) shares the Ivybridge gt1 limits but, like the
 * fallback entry, programs mocs = 0. */
static const struct gt_info byt_gt_info = {
	.name = "Baytrail (gen7)",
	.max_vs_threads = 36,
	.max_gs_threads = 36,
	.max_wm_threads = (48-1) << IVB_PS_MAX_THREADS_SHIFT,
	.urb = { 128, 512, 192, 8 },
	.gt = 1,
};

/* Haswell fallback for an unrecognised GT level; max_wm_threads also
 * carries the sample-mask field on gen7.5. */
static const struct gt_info hsw_gt_info = {
	.name = "Haswell (gen7.5)",
	.max_vs_threads = 8,
	.max_gs_threads = 8,
	.max_wm_threads =
		(8 - 1) << HSW_PS_MAX_THREADS_SHIFT |
		1 << HSW_PS_SAMPLE_MASK_SHIFT,
	.urb = { 128, 64, 64, 8 },
	.gt = 0,
};

static const struct gt_info hsw_gt1_info = {
	.name = "Haswell (gen7.5, gt1)",
	.max_vs_threads = 70,
	.max_gs_threads = 70,
	.max_wm_threads =
		(70 - 1) << HSW_PS_MAX_THREADS_SHIFT |
		1 << HSW_PS_SAMPLE_MASK_SHIFT,
	.urb = { 128, 640, 256, 8 },
	.gt = 1,
	.mocs = 5,
};

static const struct gt_info hsw_gt2_info = {
	.name = "Haswell (gen7.5, gt2)",
	.max_vs_threads = 140,
	.max_gs_threads = 140,
	.max_wm_threads =
		(140 - 1) << HSW_PS_MAX_THREADS_SHIFT |
		1 << HSW_PS_SAMPLE_MASK_SHIFT,
	.urb = { 256, 1664, 640, 8 },
	.gt = 2,
	.mocs = 5,
};

static const struct gt_info hsw_gt3_info = {
	.name = "Haswell (gen7.5, gt3)",
	.max_vs_threads = 280,
	.max_gs_threads = 280,
	.max_wm_threads =
		(280 - 1) << HSW_PS_MAX_THREADS_SHIFT |
		1 << HSW_PS_SAMPLE_MASK_SHIFT,
	.urb = { 512, 3328, 1280, 16 },
	.gt = 3,
	.mocs = 5,
};
180
/* Device-generation predicates; kgem.gen encodes the generation in octal
 * (070 = Ivybridge, 071 = Baytrail, 075 = Haswell). */
inline static bool is_ivb(struct sna *sna)
{
	return sna->kgem.gen == 070;
}

inline static bool is_byt(struct sna *sna)
{
	return sna->kgem.gen == 071;
}

inline static bool is_hsw(struct sna *sna)
{
	return sna->kgem.gen == 075;
}
195
/* Pre-assembled gen7 pixel-shader kernels for video colour conversion.
 * Each .g7b fragment is a table of raw 4-dword EU instructions produced
 * offline by the assembler; concatenating the includes splices the
 * fragments into one kernel (sample, YUV->RGB matrix, write). */
static const uint32_t ps_kernel_packed_bt601[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_argb.g7b"
#include "exa_wm_yuv_rgb_bt601.g7b"
#include "exa_wm_write.g7b"
};

static const uint32_t ps_kernel_planar_bt601[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_planar.g7b"
#include "exa_wm_yuv_rgb_bt601.g7b"
#include "exa_wm_write.g7b"
};

static const uint32_t ps_kernel_nv12_bt601[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_nv12.g7b"
#include "exa_wm_yuv_rgb_bt601.g7b"
#include "exa_wm_write.g7b"
};

static const uint32_t ps_kernel_packed_bt709[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_argb.g7b"
#include "exa_wm_yuv_rgb_bt709.g7b"
#include "exa_wm_write.g7b"
};

static const uint32_t ps_kernel_planar_bt709[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_planar.g7b"
#include "exa_wm_yuv_rgb_bt709.g7b"
#include "exa_wm_write.g7b"
};

static const uint32_t ps_kernel_nv12_bt709[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_nv12.g7b"
#include "exa_wm_yuv_rgb_bt709.g7b"
#include "exa_wm_write.g7b"
};

/* RGB passthrough: sample and write, no colour-space conversion. */
static const uint32_t ps_kernel_rgb[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_argb.g7b"
#include "exa_wm_write.g7b"
};
243
/* WM (pixel shader) kernel table, indexed by GEN7_WM_KERNEL_*.
 * KERNEL() entries reference the pre-assembled tables above (size != 0);
 * NOKERNEL() entries carry a brw generator function to be run at state
 * init (size == 0 flags them as generators, not raw code). */
#define KERNEL(kernel_enum, kernel, num_surfaces) \
    [GEN7_WM_KERNEL_##kernel_enum] = {#kernel_enum, kernel, sizeof(kernel), num_surfaces}
#define NOKERNEL(kernel_enum, func, num_surfaces) \
    [GEN7_WM_KERNEL_##kernel_enum] = {#kernel_enum, (void *)func, 0, num_surfaces}
static const struct wm_kernel_info {
	const char *name;	/* enum name, for DBG output */
	const void *data;	/* raw kernel or generator function */
	unsigned int size;	/* bytes of raw kernel; 0 => data is a generator */
	int num_surfaces;	/* binding-table entries the kernel reads/writes */
} wm_kernels[GEN7_WM_KERNEL_COUNT] = {
	NOKERNEL(NOMASK, brw_wm_kernel__affine, 2),
	NOKERNEL(NOMASK_P, brw_wm_kernel__projective, 2),

	NOKERNEL(MASK, brw_wm_kernel__affine_mask, 3),
	NOKERNEL(MASK_P, brw_wm_kernel__projective_mask, 3),

	NOKERNEL(MASKCA, brw_wm_kernel__affine_mask_ca, 3),
	NOKERNEL(MASKCA_P, brw_wm_kernel__projective_mask_ca, 3),

	NOKERNEL(MASKSA, brw_wm_kernel__affine_mask_sa, 3),
	NOKERNEL(MASKSA_P, brw_wm_kernel__projective_mask_sa, 3),

	NOKERNEL(OPACITY, brw_wm_kernel__affine_opacity, 2),
	NOKERNEL(OPACITY_P, brw_wm_kernel__projective_opacity, 2),

	KERNEL(VIDEO_PLANAR_BT601, ps_kernel_planar_bt601, 7),
	KERNEL(VIDEO_NV12_BT601, ps_kernel_nv12_bt601, 7),
	KERNEL(VIDEO_PACKED_BT601, ps_kernel_packed_bt601, 2),
	KERNEL(VIDEO_PLANAR_BT709, ps_kernel_planar_bt709, 7),
	KERNEL(VIDEO_NV12_BT709, ps_kernel_nv12_bt709, 7),
	KERNEL(VIDEO_PACKED_BT709, ps_kernel_packed_bt709, 2),
	KERNEL(VIDEO_RGB, ps_kernel_rgb, 2),
};
#undef KERNEL
278
/* Render blend-factor table indexed by the Porter-Duff PictOp* code.
 * src_alpha marks the ops whose dst factor references source alpha and
 * therefore needs the component-alpha fixup in gen7_get_blend(). */
static const struct blendinfo {
	bool src_alpha;
	uint32_t src_blend;
	uint32_t dst_blend;
} gen7_blend_op[] = {
	/* Clear */	{0, GEN7_BLENDFACTOR_ZERO, GEN7_BLENDFACTOR_ZERO},
	/* Src */	{0, GEN7_BLENDFACTOR_ONE, GEN7_BLENDFACTOR_ZERO},
	/* Dst */	{0, GEN7_BLENDFACTOR_ZERO, GEN7_BLENDFACTOR_ONE},
	/* Over */	{1, GEN7_BLENDFACTOR_ONE, GEN7_BLENDFACTOR_INV_SRC_ALPHA},
	/* OverReverse */ {0, GEN7_BLENDFACTOR_INV_DST_ALPHA, GEN7_BLENDFACTOR_ONE},
	/* In */	{0, GEN7_BLENDFACTOR_DST_ALPHA, GEN7_BLENDFACTOR_ZERO},
	/* InReverse */	{1, GEN7_BLENDFACTOR_ZERO, GEN7_BLENDFACTOR_SRC_ALPHA},
	/* Out */	{0, GEN7_BLENDFACTOR_INV_DST_ALPHA, GEN7_BLENDFACTOR_ZERO},
	/* OutReverse */ {1, GEN7_BLENDFACTOR_ZERO, GEN7_BLENDFACTOR_INV_SRC_ALPHA},
	/* Atop */	{1, GEN7_BLENDFACTOR_DST_ALPHA, GEN7_BLENDFACTOR_INV_SRC_ALPHA},
	/* AtopReverse */ {1, GEN7_BLENDFACTOR_INV_DST_ALPHA, GEN7_BLENDFACTOR_SRC_ALPHA},
	/* Xor */	{1, GEN7_BLENDFACTOR_INV_DST_ALPHA, GEN7_BLENDFACTOR_INV_SRC_ALPHA},
	/* Add */	{0, GEN7_BLENDFACTOR_ONE, GEN7_BLENDFACTOR_ONE},
};
298
299/**
300 * Highest-valued BLENDFACTOR used in gen7_blend_op.
301 *
302 * This leaves out GEN7_BLENDFACTOR_INV_DST_COLOR,
303 * GEN7_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
304 * GEN7_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
305 */
306#define GEN7_BLENDFACTOR_COUNT (GEN7_BLENDFACTOR_INV_DST_ALPHA + 1)
307
308#define GEN7_BLEND_STATE_PADDED_SIZE	ALIGN(sizeof(struct gen7_blend_state), 64)
309
310#define BLEND_OFFSET(s, d) \
311	((d != GEN7_BLENDFACTOR_ZERO) << 15 | \
312	 (((s) * GEN7_BLENDFACTOR_COUNT + (d)) * GEN7_BLEND_STATE_PADDED_SIZE))
313
314#define NO_BLEND BLEND_OFFSET(GEN7_BLENDFACTOR_ONE, GEN7_BLENDFACTOR_ZERO)
315#define CLEAR BLEND_OFFSET(GEN7_BLENDFACTOR_ZERO, GEN7_BLENDFACTOR_ZERO)
316
317#define SAMPLER_OFFSET(sf, se, mf, me) \
318	((((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) + 2) * 2 * sizeof(struct gen7_sampler_state))
319
320#define VERTEX_2s2s 0
321
322#define COPY_SAMPLER 0
323#define COPY_VERTEX VERTEX_2s2s
324#define COPY_FLAGS(a) GEN7_SET_FLAGS(COPY_SAMPLER, (a) == GXcopy ? NO_BLEND : CLEAR, GEN7_WM_KERNEL_NOMASK, COPY_VERTEX)
325
326#define FILL_SAMPLER (2 * sizeof(struct gen7_sampler_state))
327#define FILL_VERTEX VERTEX_2s2s
328#define FILL_FLAGS(op, format) GEN7_SET_FLAGS(FILL_SAMPLER, gen7_get_blend((op), false, (format)), GEN7_WM_KERNEL_NOMASK, FILL_VERTEX)
329#define FILL_FLAGS_NOBLEND GEN7_SET_FLAGS(FILL_SAMPLER, NO_BLEND, GEN7_WM_KERNEL_NOMASK, FILL_VERTEX)
330
331#define GEN7_SAMPLER(f) (((f) >> 16) & 0xfff0)
332#define GEN7_BLEND(f) (((f) >> 0) & 0x7ff0)
333#define GEN7_READS_DST(f) (((f) >> 15) & 1)
334#define GEN7_KERNEL(f) (((f) >> 16) & 0xf)
335#define GEN7_VERTEX(f) (((f) >> 0) & 0xf)
336#define GEN7_SET_FLAGS(S, B, K, V)  (((S) | (K)) << 16 | ((B) | (V)))
337
338#define OUT_BATCH(v) batch_emit(sna, v)
339#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
340#define OUT_VERTEX_F(v) vertex_emit(sna, v)
341
342static inline bool too_large(int width, int height)
343{
344	return width > GEN7_MAX_SIZE || height > GEN7_MAX_SIZE;
345}
346
/* Translate a Porter-Duff op (plus destination format and component-alpha
 * flag) into the byte offset of the matching pre-baked gen7 blend state,
 * via BLEND_OFFSET().  Returns the offset to pass to gen7_emit_cc(). */
static uint32_t gen7_get_blend(int op,
			       bool has_component_alpha,
			       uint32_t dst_format)
{
	uint32_t src, dst;

	src = gen7_blend_op[op].src_blend;
	dst = gen7_blend_op[op].dst_blend;

	/* If there's no dst alpha channel, adjust the blend op so that
	 * we'll treat it always as 1.
	 */
	if (PICT_FORMAT_A(dst_format) == 0) {
		if (src == GEN7_BLENDFACTOR_DST_ALPHA)
			src = GEN7_BLENDFACTOR_ONE;
		else if (src == GEN7_BLENDFACTOR_INV_DST_ALPHA)
			src = GEN7_BLENDFACTOR_ZERO;
	}

	/* If the source alpha is being used, then we should only be in a
	 * case where the source blend factor is 0, and the source blend
	 * value is the mask channels multiplied by the source picture's alpha.
	 */
	if (has_component_alpha && gen7_blend_op[op].src_alpha) {
		if (dst == GEN7_BLENDFACTOR_SRC_ALPHA)
			dst = GEN7_BLENDFACTOR_SRC_COLOR;
		else if (dst == GEN7_BLENDFACTOR_INV_SRC_ALPHA)
			dst = GEN7_BLENDFACTOR_INV_SRC_COLOR;
	}

	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
	     op, dst_format, PICT_FORMAT_A(dst_format),
	     src, dst, (int)BLEND_OFFSET(src, dst)));
	return BLEND_OFFSET(src, dst);
}
382
/* Map an X Render PictFormat to the gen7 sampler surface format used when
 * reading the picture as a texture; returns (uint32_t)-1 if unsupported.
 * Note X-channel formats map to their X surface variants here, unlike
 * gen7_get_dest_format() below which uses the alpha variants for writes. */
static uint32_t gen7_get_card_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
		return GEN7_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_x8r8g8b8:
		return GEN7_SURFACEFORMAT_B8G8R8X8_UNORM;
	case PICT_a8b8g8r8:
		return GEN7_SURFACEFORMAT_R8G8B8A8_UNORM;
	case PICT_x8b8g8r8:
		return GEN7_SURFACEFORMAT_R8G8B8X8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	/* 30bpp formats only exist in newer servers */
	case PICT_a2r10g10b10:
		return GEN7_SURFACEFORMAT_B10G10R10A2_UNORM;
	case PICT_x2r10g10b10:
		return GEN7_SURFACEFORMAT_B10G10R10X2_UNORM;
#endif
	case PICT_r8g8b8:
		return GEN7_SURFACEFORMAT_R8G8B8_UNORM;
	case PICT_r5g6b5:
		return GEN7_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_a1r5g5b5:
		return GEN7_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN7_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
		return GEN7_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}
414
/* Map an X Render PictFormat to the gen7 render-target surface format;
 * returns (uint32_t)-1 if the format cannot be a destination.  X-channel
 * formats share the alpha variant since the undefined channel is simply
 * written and ignored. */
static uint32_t gen7_get_dest_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
		return GEN7_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
		return GEN7_SURFACEFORMAT_R8G8B8A8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
		return GEN7_SURFACEFORMAT_B10G10R10A2_UNORM;
#endif
	case PICT_r5g6b5:
		return GEN7_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_x1r5g5b5:
	case PICT_a1r5g5b5:
		return GEN7_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN7_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
		return GEN7_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}
443
444static bool gen7_check_dst_format(PictFormat format)
445{
446	if (gen7_get_dest_format(format) != -1)
447		return true;
448
449	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
450	return false;
451}
452
453static bool gen7_check_format(uint32_t format)
454{
455	if (gen7_get_card_format(format) != -1)
456		return true;
457
458	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
459	return false;
460}
461
462static uint32_t gen7_filter(uint32_t filter)
463{
464	switch (filter) {
465	default:
466		assert(0);
467	case PictFilterNearest:
468		return SAMPLER_FILTER_NEAREST;
469	case PictFilterBilinear:
470		return SAMPLER_FILTER_BILINEAR;
471	}
472}
473
474static uint32_t gen7_check_filter(PicturePtr picture)
475{
476	switch (picture->filter) {
477	case PictFilterNearest:
478	case PictFilterBilinear:
479		return true;
480	default:
481		return false;
482	}
483}
484
485static uint32_t gen7_repeat(uint32_t repeat)
486{
487	switch (repeat) {
488	default:
489		assert(0);
490	case RepeatNone:
491		return SAMPLER_EXTEND_NONE;
492	case RepeatNormal:
493		return SAMPLER_EXTEND_REPEAT;
494	case RepeatPad:
495		return SAMPLER_EXTEND_PAD;
496	case RepeatReflect:
497		return SAMPLER_EXTEND_REFLECT;
498	}
499}
500
501static bool gen7_check_repeat(PicturePtr picture)
502{
503	if (!picture->repeat)
504		return true;
505
506	switch (picture->repeatType) {
507	case RepeatNone:
508	case RepeatNormal:
509	case RepeatPad:
510	case RepeatReflect:
511		return true;
512	default:
513		return false;
514	}
515}
516
517static int
518gen7_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
519{
520	int base;
521
522	if (has_mask) {
523		if (is_ca) {
524			if (gen7_blend_op[op].src_alpha)
525				base = GEN7_WM_KERNEL_MASKSA;
526			else
527				base = GEN7_WM_KERNEL_MASKCA;
528		} else
529			base = GEN7_WM_KERNEL_MASK;
530	} else
531		base = GEN7_WM_KERNEL_NOMASK;
532
533	return base + !is_affine;
534}
535
/* Partition the URB and allocate PS push-constant space.  Only the VS
 * section receives real entries; HS/DS/GS get zero-sized allocations as
 * those stages are disabled for 2D rendering. */
static void
gen7_emit_urb(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_PUSH_CONSTANT_ALLOC_PS | (2 - 2));
	OUT_BATCH(sna->render_state.gen7.info->urb.push_ps_size);

	/* num of VS entries must be divisible by 8 if size < 9 */
	OUT_BATCH(GEN7_3DSTATE_URB_VS | (2 - 2));
	OUT_BATCH((sna->render_state.gen7.info->urb.max_vs_entries << GEN7_URB_ENTRY_NUMBER_SHIFT) |
		  (2 - 1) << GEN7_URB_ENTRY_SIZE_SHIFT |
		  (1 << GEN7_URB_STARTING_ADDRESS_SHIFT));

	OUT_BATCH(GEN7_3DSTATE_URB_HS | (2 - 2));
	OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
		  (2 << GEN7_URB_STARTING_ADDRESS_SHIFT));

	OUT_BATCH(GEN7_3DSTATE_URB_DS | (2 - 2));
	OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
		  (2 << GEN7_URB_STARTING_ADDRESS_SHIFT));

	OUT_BATCH(GEN7_3DSTATE_URB_GS | (2 - 2));
	OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
		  (1 << GEN7_URB_STARTING_ADDRESS_SHIFT));
}
560
/* Emit STATE_BASE_ADDRESS: surface state is based at the batch itself
 * (NULL bo => batch), dynamic and instruction state at general_bo, each
 * relocated and tagged with the per-GT MOCS value. */
static void
gen7_emit_state_base_address(struct sna *sna)
{
	uint32_t mocs = sna->render_state.gen7.info->mocs << 8;

	OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));
	OUT_BATCH(0); /* general */
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* surface */
				 sna->kgem.nbatch,
				 NULL,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* dynamic */
				 sna->kgem.nbatch,
				 sna->render_state.gen7.general_bo,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 mocs | BASE_ADDRESS_MODIFY));
	OUT_BATCH(0); /* indirect */
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* instruction */
				 sna->kgem.nbatch,
				 sna->render_state.gen7.general_bo,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 mocs | BASE_ADDRESS_MODIFY));

	/* upper bounds, disable */
	OUT_BATCH(0);
	OUT_BATCH(BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(BASE_ADDRESS_MODIFY);
}
591
/* Disable the vertex shader stage (pass-through).  The #if 0 section
 * shows the related state that is deliberately left untouched. */
static void
gen7_disable_vs(struct sna *sna)
{
	/* For future reference:
	 * A PIPE_CONTROL with post-sync op set to 1 and a depth stall needs
	 * to be emitted just prior to change VS state, i.e. 3DSTATE_VS,
	 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
	 * 3DSTATE_BINDING_TABLE_POINTER_VS, 3DSTATE_SAMPLER_STATE_POINTER_VS.
	 *
	 * Here we saved by the full-flush incurred when emitting
	 * the batchbuffer.
	 */
	OUT_BATCH(GEN7_3DSTATE_VS | (6 - 2));
	OUT_BATCH(0); /* no VS kernel */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0); /* pass-through */

#if 0
	OUT_BATCH(GEN7_3DSTATE_CONSTANT_VS | (7 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_VS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_VS | (2 - 2));
	OUT_BATCH(0);
#endif
}
627
/* Disable the hull shader stage (pass-through, no kernel). */
static void
gen7_disable_hs(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_HS | (7 - 2));
	OUT_BATCH(0); /* no HS kernel */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0); /* pass-through */

#if 0
	OUT_BATCH(GEN7_3DSTATE_CONSTANT_HS | (7 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_HS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_HS | (2 - 2));
	OUT_BATCH(0);
#endif
}
655
/* Disable the tessellation engine. */
static void
gen7_disable_te(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_TE | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}
664
/* Disable the domain shader stage (no kernel). */
static void
gen7_disable_ds(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_DS | (6 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

#if 0
	OUT_BATCH(GEN7_3DSTATE_CONSTANT_DS | (7 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_DS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_DS | (2 - 2));
	OUT_BATCH(0);
#endif
}
691
/* Disable the geometry shader stage (pass-through, no kernel). */
static void
gen7_disable_gs(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_GS | (7 - 2));
	OUT_BATCH(0); /* no GS kernel */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0); /* pass-through */

#if 0
	OUT_BATCH(GEN7_3DSTATE_CONSTANT_GS | (7 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_GS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_GS | (2 - 2));
	OUT_BATCH(0);
#endif
}
719
/* Disable stream output (transform feedback). */
static void
gen7_disable_streamout(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_STREAMOUT | (3 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
}
727
/* Emit the once-per-context strips-and-fans setup: no culling and
 * trifan provoking vertex 2.  Never changes afterwards. */
static void
gen7_emit_sf_invariant(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_SF | (7 - 2));
	OUT_BATCH(0);
	OUT_BATCH(GEN7_3DSTATE_SF_CULL_NONE);
	OUT_BATCH(2 << GEN7_3DSTATE_SF_TRIFAN_PROVOKE_SHIFT);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}
739
/* Emit the once-per-context colour-calculator setup; only the CC
 * viewport pointer is cleared (defensively), the rest is unused. */
static void
gen7_emit_cc_invariant(struct sna *sna)
{
#if 0 /* unused, no change */
	OUT_BATCH(GEN7_3DSTATE_CC_STATE_POINTERS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_DEPTH_STENCIL_STATE_POINTERS | (2 - 2));
	OUT_BATCH(0);
#endif

	/* XXX clear to be safe */
	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC | (2 - 2));
	OUT_BATCH(0);
}
755
/* Disable the clipper (pass-through) and clear the SF/CLIP viewport. */
static void
gen7_disable_clip(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_CLIP | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0); /* pass-through */
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL | (2 - 2));
	OUT_BATCH(0);
}
767
/* Emit the once-per-context WM setup: dispatch enabled with perspective
 * pixel barycentrics.  Per-operation PS state is set by gen7_emit_wm(). */
static void
gen7_emit_wm_invariant(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_WM | (3 - 2));
	OUT_BATCH(GEN7_WM_DISPATCH_ENABLE |
		  GEN7_WM_PERSPECTIVE_PIXEL_BARYCENTRIC);
	OUT_BATCH(0);

#if 0
	/* XXX length bias of 7 in old spec? */
	OUT_BATCH(GEN7_3DSTATE_CONSTANT_PS | (7 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
#endif
}
787
/* Bind a null depth buffer: depth, stencil and hiz all disabled since
 * 2D rendering never depth-tests. */
static void
gen7_emit_null_depth_buffer(struct sna *sna)
{
	OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER | (7 - 2));
	OUT_BATCH(GEN7_SURFACE_NULL << GEN7_3DSTATE_DEPTH_BUFFER_TYPE_SHIFT |
		  GEN7_DEPTHFORMAT_D32_FLOAT << GEN7_3DSTATE_DEPTH_BUFFER_FORMAT_SHIFT);
	OUT_BATCH(0); /* disable depth, stencil and hiz */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

#if 0
	OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS | (3 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
#endif
}
806
/* Emit the full invariant state block for the 3D pipeline: select the 3D
 * pipe, single-sample multisampling, URB layout, base addresses, disable
 * every stage except SF/WM/CC, and bind a null depth buffer.  Called once
 * per batch when render_state.gen7.needs_invariant is set. */
static void
gen7_emit_invariant(struct sna *sna)
{
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_3D);

	OUT_BATCH(GEN7_3DSTATE_MULTISAMPLE | (4 - 2));
	OUT_BATCH(GEN7_3DSTATE_MULTISAMPLE_PIXEL_LOCATION_CENTER |
		  GEN7_3DSTATE_MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */
	OUT_BATCH(0);
	OUT_BATCH(0);

	OUT_BATCH(GEN7_3DSTATE_SAMPLE_MASK | (2 - 2));
	OUT_BATCH(1);

	gen7_emit_urb(sna);

	gen7_emit_state_base_address(sna);

	gen7_disable_vs(sna);
	gen7_disable_hs(sna);
	gen7_disable_te(sna);
	gen7_disable_ds(sna);
	gen7_disable_gs(sna);
	gen7_disable_clip(sna);
	gen7_emit_sf_invariant(sna);
	gen7_emit_wm_invariant(sna);
	gen7_emit_cc_invariant(sna);
	gen7_disable_streamout(sna);
	gen7_emit_null_depth_buffer(sna);

	sna->render_state.gen7.needs_invariant = false;
}
839
/* Point the hardware at the pre-baked blend state for this operation,
 * skipping the command if that state is already current.  blend_offset
 * comes from BLEND_OFFSET()/gen7_get_blend(). */
static void
gen7_emit_cc(struct sna *sna, uint32_t blend_offset)
{
	struct gen7_render_state *render = &sna->render_state.gen7;

	if (render->blend == blend_offset)
		return;

	DBG(("%s: blend = %x\n", __FUNCTION__, blend_offset));

	/* XXX can have up to 8 blend states preload, selectable via
	 * Render Target Index. What other side-effects of Render Target Index?
	 */

	assert (is_aligned(render->cc_blend + blend_offset, 64));
	OUT_BATCH(GEN7_3DSTATE_BLEND_STATE_POINTERS | (2 - 2));
	OUT_BATCH((render->cc_blend + blend_offset) | 1);

	render->blend = blend_offset;
}
860
/* Point the PS at the sampler state for this operation (offset from
 * SAMPLER_OFFSET()), skipping the command if already current. */
static void
gen7_emit_sampler(struct sna *sna, uint32_t state)
{
	if (sna->render_state.gen7.samplers == state)
		return;

	sna->render_state.gen7.samplers = state;

	DBG(("%s: sampler = %x\n", __FUNCTION__, state));

	assert (is_aligned(sna->render_state.gen7.wm_state + state, 32));
	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS | (2 - 2));
	OUT_BATCH(sna->render_state.gen7.wm_state + state);
}
875
/* Program 3DSTATE_SBE for one (source only) or two (source + mask)
 * texture-coordinate attributes, skipping the command when unchanged. */
static void
gen7_emit_sf(struct sna *sna, bool has_mask)
{
	int num_sf_outputs = has_mask ? 2 : 1;

	if (sna->render_state.gen7.num_sf_outputs == num_sf_outputs)
		return;

	DBG(("%s: num_sf_outputs=%d, read_length=%d, read_offset=%d\n",
	     __FUNCTION__, num_sf_outputs, 1, 0));

	sna->render_state.gen7.num_sf_outputs = num_sf_outputs;

	OUT_BATCH(GEN7_3DSTATE_SBE | (14 - 2));
	OUT_BATCH(num_sf_outputs << GEN7_SBE_NUM_OUTPUTS_SHIFT |
		  1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
		  1 << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
	OUT_BATCH(0);
	OUT_BATCH(0); /* dw4 */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0); /* dw8 */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0); /* dw12 */
	OUT_BATCH(0);
	OUT_BATCH(0);
}
906
/* Program 3DSTATE_PS for the requested WM kernel, skipping if it is
 * already active.  wm_kernel[kernel] holds up to three dispatch entry
 * points (8-, 16- and 32-wide); a zero entry means that width is not
 * available and its dispatch-enable bit stays clear. */
static void
gen7_emit_wm(struct sna *sna, int kernel)
{
	const uint32_t *kernels;

	if (sna->render_state.gen7.kernel == kernel)
		return;

	sna->render_state.gen7.kernel = kernel;
	kernels = sna->render_state.gen7.wm_kernel[kernel];

	DBG(("%s: switching to %s, num_surfaces=%d (8-wide? %d, 16-wide? %d, 32-wide? %d)\n",
	     __FUNCTION__,
	     wm_kernels[kernel].name,
	     wm_kernels[kernel].num_surfaces,
	     kernels[0], kernels[1], kernels[2]));

	OUT_BATCH(GEN7_3DSTATE_PS | (8 - 2));
	/* first non-zero entry is the primary kernel pointer (GNU ?:) */
	OUT_BATCH(kernels[0] ?: kernels[1] ?: kernels[2]);
	OUT_BATCH(1 << GEN7_PS_SAMPLER_COUNT_SHIFT |
		  wm_kernels[kernel].num_surfaces << GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
	OUT_BATCH(0); /* scratch address */
	OUT_BATCH(sna->render_state.gen7.info->max_wm_threads |
		  (kernels[0] ? GEN7_PS_8_DISPATCH_ENABLE : 0) |
		  (kernels[1] ? GEN7_PS_16_DISPATCH_ENABLE : 0) |
		  (kernels[2] ? GEN7_PS_32_DISPATCH_ENABLE : 0) |
		  GEN7_PS_ATTRIBUTE_ENABLE);
	/* start GRF for the primary dispatch width; slots 1/2 are fixed */
	OUT_BATCH((kernels[0] ? 4 : kernels[1] ? 6 : 8) << GEN7_PS_DISPATCH_START_GRF_SHIFT_0 |
		  8 << GEN7_PS_DISPATCH_START_GRF_SHIFT_1 |
		  6 << GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
	OUT_BATCH(kernels[2]);
	OUT_BATCH(kernels[1]);
}
940
/* Point the PS at the surface binding table (offset is in dwords within
 * the surface-state buffer).  Returns true if the table changed, so the
 * caller knows surface state was re-bound; false if it was a no-op. */
static bool
gen7_emit_binding_table(struct sna *sna, uint16_t offset)
{
	if (sna->render_state.gen7.surface_table == offset)
		return false;

	/* Binding table pointers */
	assert(is_aligned(4*offset, 32));
	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS | (2 - 2));
	OUT_BATCH(offset*4);

	sna->render_state.gen7.surface_table = offset;
	return true;
}
955
/* Program the drawing rectangle (clip window + origin) for the current
 * destination.  Returns true when the state was already current and
 * nothing was emitted; false when the command was written. */
static bool
gen7_emit_drawing_rectangle(struct sna *sna,
			    const struct sna_composite_op *op)
{
	/* ymax:xmax packed in one dword, inclusive limits */
	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;

	assert(!too_large(abs(op->dst.x), abs(op->dst.y)));
	assert(!too_large(op->dst.width, op->dst.height));

	if (sna->render_state.gen7.drawrect_limit == limit &&
	    sna->render_state.gen7.drawrect_offset == offset)
		return true;

	sna->render_state.gen7.drawrect_offset = offset;
	sna->render_state.gen7.drawrect_limit = limit;

	OUT_BATCH(GEN7_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(limit);
	OUT_BATCH(offset);
	return false;
}
979
/* Program 3DSTATE_VERTEX_ELEMENTS for the operation's vertex layout.
 * The layout id (GEN7_VERTEX(flags)) encodes the source channel float
 * count in bits 0-1 (0 => packed 2s16 coords) and the mask channel float
 * count in bits 2+ (0 => no mask); the command is skipped when the id is
 * already programmed. */
static void
gen7_emit_vertex_elements(struct sna *sna,
			  const struct sna_composite_op *op)
{
	/*
	 * vertex data in vertex buffer
	 *    position: (x, y)
	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
	 *    texture coordinate 1 if (has_mask is true): same as above
	 */
	struct gen7_render_state *render = &sna->render_state.gen7;
	uint32_t src_format, dw;
	int id = GEN7_VERTEX(op->u.gen7.flags);
	bool has_mask;

	DBG(("%s: setup id=%d\n", __FUNCTION__, id));

	if (render->ve_id == id)
		return;
	render->ve_id = id;

	/* The VUE layout
	 *    dword 0-3: pad (0.0, 0.0, 0.0. 0.0)
	 *    dword 4-7: position (x, y, 1.0, 1.0),
	 *    dword 8-11: texture coordinate 0 (u0, v0, w0, 1.0)
	 *    dword 12-15: texture coordinate 1 (u1, v1, w1, 1.0)
	 *
	 * dword 4-15 are fetched from vertex buffer
	 */
	has_mask = (id >> 2) != 0;
	OUT_BATCH(GEN7_3DSTATE_VERTEX_ELEMENTS |
		((2 * (3 + has_mask)) + 1 - 2));

	/* element 0: constant zero pad (nothing read from the buffer) */
	OUT_BATCH(id << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
		  GEN7_SURFACEFORMAT_R32G32B32A32_FLOAT << GEN7_VE0_FORMAT_SHIFT |
		  0 << GEN7_VE0_OFFSET_SHIFT);
	OUT_BATCH(GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_0_SHIFT |
		  GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_1_SHIFT |
		  GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT |
		  GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_3_SHIFT);

	/* x,y */
	OUT_BATCH(id << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
		  GEN7_SURFACEFORMAT_R16G16_SSCALED << GEN7_VE0_FORMAT_SHIFT |
		  0 << GEN7_VE0_OFFSET_SHIFT);
	OUT_BATCH(GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT |
		  GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT |
		  GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT |
		  GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT);

	/* u0, v0, w0 */
	DBG(("%s: first channel %d floats, offset=4b\n", __FUNCTION__, id & 3));
	dw = GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT;
	switch (id & 3) {
	default:
		assert(0);
	case 0:
		/* packed 16-bit coordinates, as for the position element */
		src_format = GEN7_SURFACEFORMAT_R16G16_SSCALED;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 1:
		src_format = GEN7_SURFACEFORMAT_R32_FLOAT;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_1_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 2:
		src_format = GEN7_SURFACEFORMAT_R32G32_FLOAT;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 3:
		src_format = GEN7_SURFACEFORMAT_R32G32B32_FLOAT;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT;
		dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_2_SHIFT;
		break;
	}
	OUT_BATCH(id << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
		  src_format << GEN7_VE0_FORMAT_SHIFT |
		  4 << GEN7_VE0_OFFSET_SHIFT);
	OUT_BATCH(dw);

	/* u1, v1, w1 */
	if (has_mask) {
		/* mask data follows the source channel; a packed source
		 * (id&3 == 0) occupies one float's worth (GNU ?:) */
		unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
		DBG(("%s: second channel %d floats, offset=%db\n", __FUNCTION__, id >> 2, offset));
		dw = GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT;
		switch (id >> 2) {
		case 1:
			src_format = GEN7_SURFACEFORMAT_R32_FLOAT;
			dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
			dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_1_SHIFT;
			dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT;
			break;
		default:
			assert(0);
		case 2:
			src_format = GEN7_SURFACEFORMAT_R32G32_FLOAT;
			dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
			dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT;
			dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT;
			break;
		case 3:
			src_format = GEN7_SURFACEFORMAT_R32G32B32_FLOAT;
			dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
			dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT;
			dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_2_SHIFT;
			break;
		}
		OUT_BATCH(id << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
			  src_format << GEN7_VE0_FORMAT_SHIFT |
			  offset << GEN7_VE0_OFFSET_SHIFT);
		OUT_BATCH(dw);
	}
}
1099
inline static void
gen7_emit_pipe_invalidate(struct sna *sna)
{
	/* Emit the heavyweight PIPE_CONTROL: flush the render-target and
	 * texture caches and stall the command streamer.  Used by
	 * gen7_emit_state() when the source/mask bo was written earlier
	 * in this batch and must be re-read coherently.
	 */
	OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
	OUT_BATCH(GEN7_PIPE_CONTROL_WC_FLUSH |
		  GEN7_PIPE_CONTROL_TC_FLUSH |
		  GEN7_PIPE_CONTROL_CS_STALL);
	OUT_BATCH(0);	/* no post-sync write address */
	OUT_BATCH(0);	/* no immediate data */
	/* This PIPE_CONTROL stalls, so reset the since-stall counter. */
	sna->render_state.gen7.pipe_controls_since_stall = 0;
}
1111
inline static void
gen7_emit_pipe_flush(struct sna *sna, bool need_stall)
{
	/* Emit a render-target cache flush, optionally combined with a
	 * CS stall.  Tracks how many non-stalling PIPE_CONTROLs have
	 * been emitted since the last stall (consumed by
	 * gen7_emit_state() as a pre-075 workaround).
	 */
	unsigned stall;

	stall = 0;
	if (need_stall) {
		stall = GEN7_PIPE_CONTROL_CS_STALL;
		sna->render_state.gen7.pipe_controls_since_stall = 0;
	} else
		sna->render_state.gen7.pipe_controls_since_stall++;

	OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
	OUT_BATCH(GEN7_PIPE_CONTROL_WC_FLUSH | stall);
	OUT_BATCH(0);	/* no post-sync write address */
	OUT_BATCH(0);	/* no immediate data */
}
1129
inline static void
gen7_emit_pipe_stall(struct sna *sna)
{
	/* Emit a pure stall (CS stall + stall-at-scoreboard), without
	 * flushing any caches.  Used when only state serialisation is
	 * required, e.g. before swapping the binding table or the CA
	 * fixup pass.
	 */
	OUT_BATCH(GEN7_PIPE_CONTROL | (4 - 2));
	OUT_BATCH(GEN7_PIPE_CONTROL_CS_STALL |
		  GEN7_PIPE_CONTROL_STALL_AT_SCOREBOARD);
	OUT_BATCH(0);	/* no post-sync write address */
	OUT_BATCH(0);	/* no immediate data */
	sna->render_state.gen7.pipe_controls_since_stall = 0;
}
1140
static void
gen7_emit_state(struct sna *sna,
		const struct sna_composite_op *op,
		uint16_t wm_binding_table)
{
	/* (Re)emit the 3D pipeline state for this composite op, preceded
	 * by the cheapest sufficient PIPE_CONTROL:
	 *   invalidate (implies flush+stall) > flush (optional stall) > stall.
	 *
	 * Bit 0 of wm_binding_table is overloaded by callers as a
	 * "destination was dirty" flag; it is masked off before the
	 * table offset is used.
	 */
	bool need_invalidate;
	bool need_flush;
	bool need_stall;

	assert(op->dst.bo->exec);

	/* Flush if the dst was flagged dirty, or if the previous op read
	 * the destination and this one does too (emit_flush latched below). */
	need_flush = wm_binding_table & 1 ||
		(sna->render_state.gen7.emit_flush && GEN7_READS_DST(op->u.gen7.flags));
	if (ALWAYS_FLUSH)
		need_flush = true;

	wm_binding_table &= ~1;

	/* Swapping to a different binding table requires at least a stall. */
	need_stall = sna->render_state.gen7.surface_table != wm_binding_table;

	/* Sampling a bo written earlier in this batch needs the full
	 * cache invalidation. */
	need_invalidate = kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo);
	if (ALWAYS_INVALIDATE)
		need_invalidate = true;

	/* No stall needed if the drawing rectangle was already current. */
	need_stall &= gen7_emit_drawing_rectangle(sna, op);
	if (ALWAYS_STALL)
		need_stall = true;
	/* On gen < 075, force a stall after every third non-stalling
	 * PIPE_CONTROL (hardware workaround; counter maintained by the
	 * pipe-control emitters above). */
	if (sna->kgem.gen < 075 &&
	    sna->render_state.gen7.pipe_controls_since_stall >= 3)
		need_stall = true;

	if (need_invalidate) {
		gen7_emit_pipe_invalidate(sna);
		kgem_clear_dirty(&sna->kgem);
		assert(op->dst.bo->exec);
		kgem_bo_mark_dirty(op->dst.bo);

		/* The invalidate subsumes both the flush and the stall. */
		need_flush = false;
		need_stall = false;
	}
	if (need_flush) {
		gen7_emit_pipe_flush(sna, need_stall);
		need_stall = false;
	}
	if (need_stall)
		gen7_emit_pipe_stall(sna);

	gen7_emit_cc(sna, GEN7_BLEND(op->u.gen7.flags));
	gen7_emit_sampler(sna, GEN7_SAMPLER(op->u.gen7.flags));
	gen7_emit_sf(sna, GEN7_VERTEX(op->u.gen7.flags) >> 2);
	gen7_emit_wm(sna, GEN7_KERNEL(op->u.gen7.flags));
	gen7_emit_vertex_elements(sna, op);
	gen7_emit_binding_table(sna, wm_binding_table);

	/* Remember whether this op read its destination so the next
	 * gen7_emit_state() can decide whether to flush first. */
	sna->render_state.gen7.emit_flush = GEN7_READS_DST(op->u.gen7.flags);
}
1197
static bool gen7_magic_ca_pass(struct sna *sna,
			       const struct sna_composite_op *op)
{
	/* Second pass for component-alpha compositing: replay exactly the
	 * vertex range emitted since vertex_start with an OpAdd blend and
	 * the component-alpha kernel.  Returns true if a pass was emitted
	 * (callers must then restore the op's own cc/wm state).
	 */
	struct gen7_render_state *state = &sna->render_state.gen7;

	if (!op->need_magic_ca_pass)
		return false;

	DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
	     sna->render.vertex_start, sna->render.vertex_index));

	/* Serialise against the first pass before swapping blend/kernel. */
	gen7_emit_pipe_stall(sna);

	gen7_emit_cc(sna,
		     GEN7_BLEND(gen7_get_blend(PictOpAdd, true,
					       op->dst.format)));
	gen7_emit_wm(sna,
		     gen7_choose_composite_kernel(PictOpAdd,
						  true, true,
						  op->is_affine));

	/* Re-issue the same rectangles as a fresh 3DPRIMITIVE. */
	OUT_BATCH(GEN7_3DPRIMITIVE | (7- 2));
	OUT_BATCH(GEN7_3DPRIMITIVE_VERTEX_SEQUENTIAL | _3DPRIM_RECTLIST);
	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
	OUT_BATCH(sna->render.vertex_start);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */

	state->last_primitive = sna->kgem.nbatch;
	return true;
}
1230
static void null_create(struct sna_static_stream *stream)
{
	/* A bunch of zeros useful for legacy border color and depth-stencil */
	/* Reserves 64 zeroed, 64-byte-aligned bytes at the start of the
	 * static stream; the mapping itself is the only effect. */
	sna_static_stream_map(stream, 64, 64);
}
1236
1237static void
1238sampler_state_init(struct gen7_sampler_state *sampler_state,
1239		   sampler_filter_t filter,
1240		   sampler_extend_t extend)
1241{
1242	sampler_state->ss0.lod_preclamp = 1;	/* GL mode */
1243
1244	/* We use the legacy mode to get the semantics specified by
1245	 * the Render extension. */
1246	sampler_state->ss0.default_color_mode = GEN7_BORDER_COLOR_MODE_LEGACY;
1247
1248	switch (filter) {
1249	default:
1250	case SAMPLER_FILTER_NEAREST:
1251		sampler_state->ss0.min_filter = GEN7_MAPFILTER_NEAREST;
1252		sampler_state->ss0.mag_filter = GEN7_MAPFILTER_NEAREST;
1253		break;
1254	case SAMPLER_FILTER_BILINEAR:
1255		sampler_state->ss0.min_filter = GEN7_MAPFILTER_LINEAR;
1256		sampler_state->ss0.mag_filter = GEN7_MAPFILTER_LINEAR;
1257		break;
1258	}
1259
1260	switch (extend) {
1261	default:
1262	case SAMPLER_EXTEND_NONE:
1263		sampler_state->ss3.r_wrap_mode = GEN7_TEXCOORDMODE_CLAMP_BORDER;
1264		sampler_state->ss3.s_wrap_mode = GEN7_TEXCOORDMODE_CLAMP_BORDER;
1265		sampler_state->ss3.t_wrap_mode = GEN7_TEXCOORDMODE_CLAMP_BORDER;
1266		break;
1267	case SAMPLER_EXTEND_REPEAT:
1268		sampler_state->ss3.r_wrap_mode = GEN7_TEXCOORDMODE_WRAP;
1269		sampler_state->ss3.s_wrap_mode = GEN7_TEXCOORDMODE_WRAP;
1270		sampler_state->ss3.t_wrap_mode = GEN7_TEXCOORDMODE_WRAP;
1271		break;
1272	case SAMPLER_EXTEND_PAD:
1273		sampler_state->ss3.r_wrap_mode = GEN7_TEXCOORDMODE_CLAMP;
1274		sampler_state->ss3.s_wrap_mode = GEN7_TEXCOORDMODE_CLAMP;
1275		sampler_state->ss3.t_wrap_mode = GEN7_TEXCOORDMODE_CLAMP;
1276		break;
1277	case SAMPLER_EXTEND_REFLECT:
1278		sampler_state->ss3.r_wrap_mode = GEN7_TEXCOORDMODE_MIRROR;
1279		sampler_state->ss3.s_wrap_mode = GEN7_TEXCOORDMODE_MIRROR;
1280		sampler_state->ss3.t_wrap_mode = GEN7_TEXCOORDMODE_MIRROR;
1281		break;
1282	}
1283}
1284
static void
sampler_copy_init(struct gen7_sampler_state *ss)
{
	/* Sampler pair for copy operations: slot 0 samples the source
	 * with nearest filtering and unnormalized (pixel) coordinates;
	 * slot 1 is a plain nearest/none sampler. */
	sampler_state_init(ss, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
	ss->ss3.non_normalized_coord = 1;

	sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
}
1293
static void
sampler_fill_init(struct gen7_sampler_state *ss)
{
	/* Sampler pair for fills: slot 0 repeats the source tile using
	 * nearest filtering with unnormalized (pixel) coordinates;
	 * slot 1 is a plain nearest/none sampler. */
	sampler_state_init(ss, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_REPEAT);
	ss->ss3.non_normalized_coord = 1;

	sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
}
1302
1303static uint32_t
1304gen7_tiling_bits(uint32_t tiling)
1305{
1306	switch (tiling) {
1307	default: assert(0);
1308	case I915_TILING_NONE: return 0;
1309	case I915_TILING_X: return GEN7_SURFACE_TILED;
1310	case I915_TILING_Y: return GEN7_SURFACE_TILED | GEN7_SURFACE_TILED_Y;
1311	}
1312}
1313
1314/**
1315 * Sets up the common fields for a surface state buffer for the given
1316 * picture in the given surface state buffer.
1317 */
static uint32_t
gen7_bind_bo(struct sna *sna,
	     struct kgem_bo *bo,
	     uint32_t width,
	     uint32_t height,
	     uint32_t format,
	     bool is_dst)
{
	/* Emit (or reuse) a SURFACE_STATE for bo and return its byte
	 * offset within the batch.  Surface states are cached per-bo,
	 * keyed on format plus the is_dst/is_scanout bits; a cache hit
	 * only needs to propagate the dirty flag. */
	uint32_t *ss;
	uint32_t domains;
	int offset;
	uint32_t is_scanout = is_dst && bo->scanout;

	COMPILE_TIME_ASSERT(sizeof(struct gen7_surface_state) == 32);

	/* After the first bind, we manage the cache domains within the batch */
	offset = kgem_bo_get_binding(bo, format | is_dst << 30 | is_scanout << 31);
	if (offset) {
		assert(offset >= sna->kgem.surface);
		if (is_dst)
			kgem_bo_mark_dirty(bo);
		return offset * sizeof(uint32_t);
	}

	/* Surface states grow downwards from the end of the batch. */
	offset = sna->kgem.surface -=
		sizeof(struct gen7_surface_state) / sizeof(uint32_t);
	ss = sna->kgem.batch + offset;
	ss[0] = (GEN7_SURFACE_2D << GEN7_SURFACE_TYPE_SHIFT |
		 gen7_tiling_bits(bo->tiling) |
		 format << GEN7_SURFACE_FORMAT_SHIFT);
	if (bo->tiling == I915_TILING_Y)
		ss[0] |= GEN7_SURFACE_VALIGN_4;
	if (is_dst) {
		ss[0] |= GEN7_SURFACE_RC_READ_WRITE;
		domains = I915_GEM_DOMAIN_RENDER << 16 |I915_GEM_DOMAIN_RENDER;
	} else
		domains = I915_GEM_DOMAIN_SAMPLER << 16;
	/* Relocation for the surface base address (dword 1). */
	ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0);
	ss[2] = ((width - 1)  << GEN7_SURFACE_WIDTH_SHIFT |
		 (height - 1) << GEN7_SURFACE_HEIGHT_SHIFT);
	ss[3] = (bo->pitch - 1) << GEN7_SURFACE_PITCH_SHIFT;
	ss[4] = 0;
	/* Scanout and io buffers must stay uncached (mocs 0). */
	ss[5] = (is_scanout || bo->io) ? 0 : sna->render_state.gen7.info->mocs << 16;
	ss[6] = 0;
	ss[7] = 0;
	if (is_hsw(sna))
		ss[7] |= HSW_SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);

	/* Cache this binding for subsequent uses of the bo in the batch. */
	kgem_bo_set_binding(bo, format | is_dst << 30 | is_scanout << 31, offset);

	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
	     offset, bo->handle, ss[1],
	     format, width, height, bo->pitch, bo->tiling,
	     domains & 0xffff ? "render" : "sampler"));

	return offset * sizeof(uint32_t);
}
1375
static void gen7_emit_vertex_buffer(struct sna *sna,
				    const struct sna_composite_op *op)
{
	/* Emit 3DSTATE_VERTEX_BUFFERS for the op's vertex layout.  The
	 * buffer address dword is left 0 and its batch position recorded
	 * in vertex_reloc[] for later fixup once the vbo is finalised.
	 */
	int id = GEN7_VERTEX(op->u.gen7.flags);

	OUT_BATCH(GEN7_3DSTATE_VERTEX_BUFFERS | (5 - 2));
	OUT_BATCH(id << GEN7_VB0_BUFFER_INDEX_SHIFT |
		  GEN7_VB0_VERTEXDATA |
		  GEN7_VB0_ADDRESS_MODIFY_ENABLE |
		  4*op->floats_per_vertex << GEN7_VB0_BUFFER_PITCH_SHIFT);
	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
	OUT_BATCH(0);	/* address, patched in later */
	OUT_BATCH(~0); /* max address: disabled */
	OUT_BATCH(0);

	/* Mark this layout as bound so it is not re-emitted. */
	sna->render.vb_id |= 1 << id;
}
1393
static void gen7_emit_primitive(struct sna *sna)
{
	/* Start (or continue) a RECTLIST 3DPRIMITIVE.  If the previous
	 * primitive is still the last thing in the batch we simply
	 * reopen it by pointing vertex_offset back at its vertex-count
	 * dword instead of emitting a new command.
	 */
	if (sna->kgem.nbatch == sna->render_state.gen7.last_primitive) {
		sna->render.vertex_offset = sna->kgem.nbatch - 5;
		return;
	}

	OUT_BATCH(GEN7_3DPRIMITIVE | (7- 2));
	OUT_BATCH(GEN7_3DPRIMITIVE_VERTEX_SEQUENTIAL | _3DPRIM_RECTLIST);
	/* Remember where the vertex count goes; filled in at flush time. */
	sna->render.vertex_offset = sna->kgem.nbatch;
	OUT_BATCH(0);	/* vertex count, to be filled in later */
	OUT_BATCH(sna->render.vertex_index);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */
	sna->render.vertex_start = sna->render.vertex_index;

	sna->render_state.gen7.last_primitive = sna->kgem.nbatch;
}
1413
1414static bool gen7_rectangle_begin(struct sna *sna,
1415				 const struct sna_composite_op *op)
1416{
1417	int id = 1 << GEN7_VERTEX(op->u.gen7.flags);
1418	int ndwords;
1419
1420	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
1421		return true;
1422
1423	ndwords = op->need_magic_ca_pass ? 60 : 6;
1424	if ((sna->render.vb_id & id) == 0)
1425		ndwords += 5;
1426	if (!kgem_check_batch(&sna->kgem, ndwords))
1427		return false;
1428
1429	if ((sna->render.vb_id & id) == 0)
1430		gen7_emit_vertex_buffer(sna, op);
1431
1432	gen7_emit_primitive(sna);
1433	return true;
1434}
1435
static int gen7_get_rectangles__flush(struct sna *sna,
				      const struct sna_composite_op *op)
{
	/* Slow path when the vbo is out of space: flush the pending
	 * vertices (running the CA pass if needed) and obtain a fresh
	 * vbo.  Returns the number of floats now available, or 0 if the
	 * batch itself must be submitted first.
	 */
	/* Preventing discarding new vbo after lock contention */
	if (sna_vertex_wait__locked(&sna->render)) {
		int rem = vertex_space(sna);
		if (rem > op->floats_per_rect)
			return rem;
	}

	/* Make sure the batch can still take a primitive and two relocs. */
	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 6))
		return 0;
	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
		return 0;

	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		/* The CA pass reprograms blend and kernel; restore the
		 * current op's state afterwards. */
		if (gen7_magic_ca_pass(sna, op)) {
			gen7_emit_pipe_stall(sna);
			gen7_emit_cc(sna, GEN7_BLEND(op->u.gen7.flags));
			gen7_emit_wm(sna, GEN7_KERNEL(op->u.gen7.flags));
		}
	}

	return gen4_vertex_finish(sna);
}
1462
inline static int gen7_get_rectangles(struct sna *sna,
				      const struct sna_composite_op *op,
				      int want,
				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
{
	/* Reserve vertex space for up to 'want' rectangles (3 vertices
	 * each) and ensure a primitive is open, retrying after flushing
	 * the vbo or submitting the batch as necessary.  Returns the
	 * number of rectangles actually reserved (>= 1); emit_state is
	 * invoked to rebuild all state after a batch submission.
	 */
	int rem;

	assert(want);

start:
	rem = vertex_space(sna);
	if (unlikely(rem < op->floats_per_rect)) {
		DBG(("flushing vbo for %s: %d < %d\n",
		     __FUNCTION__, rem, op->floats_per_rect));
		rem = gen7_get_rectangles__flush(sna, op);
		if (unlikely(rem == 0))
			goto flush;
	}

	/* No primitive currently open: begin one, then re-check space. */
	if (unlikely(sna->render.vertex_offset == 0)) {
		if (!gen7_rectangle_begin(sna, op))
			goto flush;
		else
			goto start;
	}

	assert(rem <= vertex_space(sna));
	assert(op->floats_per_rect <= rem);
	/* Trim the request to what fits in the remaining vertex space. */
	if (want > 1 && want * op->floats_per_rect > rem)
		want = rem / op->floats_per_rect;

	assert(want > 0);
	sna->render.vertex_index += 3*want;
	return want;

flush:
	/* Batch full: close the primitive, submit, re-emit state, retry. */
	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		gen7_magic_ca_pass(sna, op);
	}
	sna_vertex_wait__locked(&sna->render);
	_kgem_submit(&sna->kgem);
	emit_state(sna, op);
	goto start;
}
1508
inline static uint32_t *gen7_composite_get_binding_table(struct sna *sna,
							 uint16_t *offset)
{
	/* Allocate a zeroed binding table at the top of the batch
	 * (surface space grows downwards).  Returns a pointer for the
	 * caller to fill with surface-state offsets, and stores the
	 * table's dword offset in *offset.
	 */
	uint32_t *table;

	sna->kgem.surface -=
		sizeof(struct gen7_surface_state) / sizeof(uint32_t);
	/* Clear all surplus entries to zero in case of prefetch */
	table = memset(sna->kgem.batch + sna->kgem.surface,
		       0, sizeof(struct gen7_surface_state));

	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));

	*offset = sna->kgem.surface;
	return table;
}
1525
static void
gen7_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
	/* Prepare the batch for a render op: switch to the render ring,
	 * submit first if there is not enough room for the state plus
	 * four surfaces, and emit the one-time invariant state if this
	 * is a fresh batch.
	 */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);

	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 4)) {
		DBG(("%s: flushing batch: %d < %d+%d\n",
		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
		     150, 4*8));
		_kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	assert(sna->kgem.mode == KGEM_RENDER);
	assert(sna->kgem.ring == KGEM_RENDER);

	if (sna->render_state.gen7.needs_invariant)
		gen7_emit_invariant(sna);
}
1545
static void gen7_emit_composite_state(struct sna *sna,
				      const struct sna_composite_op *op)
{
	/* Build the binding table (dst, src, optional mask) for the
	 * composite op and emit the pipeline state.  If the freshly
	 * built table matches the previous one, the new allocation is
	 * released and the old table reused.  The destination's dirty
	 * status is passed to gen7_emit_state() in bit 0 of the offset.
	 */
	uint32_t *binding_table;
	uint16_t offset, dirty;

	gen7_get_batch(sna, op);

	binding_table = gen7_composite_get_binding_table(sna, &offset);

	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table[0] =
		gen7_bind_bo(sna,
			    op->dst.bo, op->dst.width, op->dst.height,
			    gen7_get_dest_format(op->dst.format),
			    true);
	binding_table[1] =
		gen7_bind_bo(sna,
			     op->src.bo, op->src.width, op->src.height,
			     op->src.card_format,
			     false);
	if (op->mask.bo) {
		binding_table[2] =
			gen7_bind_bo(sna,
				     op->mask.bo,
				     op->mask.width,
				     op->mask.height,
				     op->mask.card_format,
				     false);
	}

	/* Deduplicate: if dst+src (and mask, when present) match the
	 * current surface table, discard the new one. */
	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen7.surface_table) == *(uint64_t*)binding_table &&
	    (op->mask.bo == NULL ||
	     sna->kgem.batch[sna->render_state.gen7.surface_table+2] == binding_table[2])) {
		sna->kgem.surface += sizeof(struct gen7_surface_state) / sizeof(uint32_t);
		offset = sna->render_state.gen7.surface_table;
	}

	/* Same destination surface as before: its dirtiness was already
	 * accounted for, no extra flush needed. */
	if (sna->kgem.batch[sna->render_state.gen7.surface_table] == binding_table[0])
		dirty = 0;

	gen7_emit_state(sna, op, offset | dirty);
}
1591
1592static void
1593gen7_align_vertex(struct sna *sna, const struct sna_composite_op *op)
1594{
1595	if (op->floats_per_vertex != sna->render_state.gen7.floats_per_vertex) {
1596		DBG(("aligning vertex: was %d, now %d floats per vertex\n",
1597		     sna->render_state.gen7.floats_per_vertex, op->floats_per_vertex));
1598		gen4_vertex_align(sna, op);
1599		sna->render_state.gen7.floats_per_vertex = op->floats_per_vertex;
1600	}
1601}
1602
fastcall static void
gen7_render_composite_blt(struct sna *sna,
			  const struct sna_composite_op *op,
			  const struct sna_composite_rectangles *r)
{
	/* Emit a single composite rectangle: reserve vertex space then
	 * delegate to the op's vertex emitter. */
	gen7_get_rectangles(sna, op, 1, gen7_emit_composite_state);
	op->prim_emit(sna, op, r);
}
1611
1612fastcall static void
1613gen7_render_composite_box(struct sna *sna,
1614			  const struct sna_composite_op *op,
1615			  const BoxRec *box)
1616{
1617	struct sna_composite_rectangles r;
1618
1619	gen7_get_rectangles(sna, op, 1, gen7_emit_composite_state);
1620
1621	DBG(("  %s: (%d, %d), (%d, %d)\n",
1622	     __FUNCTION__,
1623	     box->x1, box->y1, box->x2, box->y2));
1624
1625	r.dst.x = box->x1;
1626	r.dst.y = box->y1;
1627	r.width  = box->x2 - box->x1;
1628	r.height = box->y2 - box->y1;
1629	r.src = r.mask = r.dst;
1630
1631	op->prim_emit(sna, op, &r);
1632}
1633
static void
gen7_render_composite_boxes__blt(struct sna *sna,
				 const struct sna_composite_op *op,
				 const BoxRec *box, int nbox)
{
	/* Composite an array of boxes one rectangle at a time via
	 * op->prim_emit.  Fallback path for ops without a batched
	 * emit_boxes vector; the outer loop reserves as many rectangles
	 * as the vbo allows per iteration.
	 */
	DBG(("composite_boxes(%d)\n", nbox));

	do {
		int nbox_this_time;

		nbox_this_time = gen7_get_rectangles(sna, op, nbox,
						     gen7_emit_composite_state);
		nbox -= nbox_this_time;

		do {
			struct sna_composite_rectangles r;

			DBG(("  %s: (%d, %d), (%d, %d)\n",
			     __FUNCTION__,
			     box->x1, box->y1, box->x2, box->y2));

			r.dst.x = box->x1;
			r.dst.y = box->y1;
			r.width  = box->x2 - box->x1;
			r.height = box->y2 - box->y1;
			r.src = r.mask = r.dst;

			op->prim_emit(sna, op, &r);
			box++;
		} while (--nbox_this_time);
	} while (nbox);
}
1666
static void
gen7_render_composite_boxes(struct sna *sna,
			    const struct sna_composite_op *op,
			    const BoxRec *box, int nbox)
{
	/* Composite an array of boxes using the op's batched emit_boxes
	 * vector: vertex space for a whole group of boxes is reserved
	 * up front and filled in one call.
	 */
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen7_get_rectangles(sna, op, nbox,
						     gen7_emit_composite_state);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		/* Claim the vertex range before writing into it. */
		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;
	} while (nbox);
}
1690
static void
gen7_render_composite_boxes__thread(struct sna *sna,
				    const struct sna_composite_op *op,
				    const BoxRec *box, int nbox)
{
	/* Threaded variant of gen7_render_composite_boxes: vertex space
	 * is reserved under the vertex lock, then the lock is dropped
	 * (holding an acquire reference) while this thread fills in the
	 * vertices, allowing other threads to reserve concurrently.
	 */
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen7_get_rectangles(sna, op, nbox,
						     gen7_emit_composite_state);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		/* Claim the vertex range while still holding the lock. */
		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		/* Fill vertices outside the lock. */
		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
1722
#ifndef MAX
/* NOTE: evaluates both arguments twice; do not pass expressions with
 * side effects. */
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
1726
1727static uint32_t
1728gen7_composite_create_blend_state(struct sna_static_stream *stream)
1729{
1730	char *base, *ptr;
1731	int src, dst;
1732
1733	base = sna_static_stream_map(stream,
1734				     GEN7_BLENDFACTOR_COUNT * GEN7_BLENDFACTOR_COUNT * GEN7_BLEND_STATE_PADDED_SIZE,
1735				     64);
1736
1737	ptr = base;
1738	for (src = 0; src < GEN7_BLENDFACTOR_COUNT; src++) {
1739		for (dst= 0; dst < GEN7_BLENDFACTOR_COUNT; dst++) {
1740			struct gen7_blend_state *blend =
1741				(struct gen7_blend_state *)ptr;
1742
1743			blend->blend0.dest_blend_factor = dst;
1744			blend->blend0.source_blend_factor = src;
1745			blend->blend0.blend_func = GEN7_BLENDFUNCTION_ADD;
1746			blend->blend0.blend_enable =
1747				!(dst == GEN7_BLENDFACTOR_ZERO && src == GEN7_BLENDFACTOR_ONE);
1748
1749			blend->blend1.post_blend_clamp_enable = 1;
1750			blend->blend1.pre_blend_clamp_enable = 1;
1751
1752			ptr += GEN7_BLEND_STATE_PADDED_SIZE;
1753		}
1754	}
1755
1756	return sna_static_stream_offsetof(stream, base);
1757}
1758
static uint32_t gen7_bind_video_source(struct sna *sna,
				       struct kgem_bo *bo,
				       uint32_t offset,
				       int width,
				       int height,
				       int pitch,
				       uint32_t format)
{
	/* Emit a SURFACE_STATE for one plane of a video frame, using an
	 * explicit byte offset into the (linear) bo so that several
	 * planes can share a single buffer.  Unlike gen7_bind_bo() these
	 * are never cached.  Returns the byte offset of the state.
	 */
	uint32_t *ss, bind;

	/* Surface states grow downwards from the end of the batch. */
	bind = sna->kgem.surface -=
		sizeof(struct gen7_surface_state) / sizeof(uint32_t);

	assert(bo->tiling == I915_TILING_NONE);

	ss = sna->kgem.batch + bind;
	ss[0] = (GEN7_SURFACE_2D << GEN7_SURFACE_TYPE_SHIFT |
		 format << GEN7_SURFACE_FORMAT_SHIFT);
	/* Relocation for the plane base address (bo + offset). */
	ss[1] = kgem_add_reloc(&sna->kgem, bind + 1, bo,
			       I915_GEM_DOMAIN_SAMPLER << 16,
			       offset);
	ss[2] = ((width - 1)  << GEN7_SURFACE_WIDTH_SHIFT |
		 (height - 1) << GEN7_SURFACE_HEIGHT_SHIFT);
	ss[3] = (pitch - 1) << GEN7_SURFACE_PITCH_SHIFT;
	ss[4] = 0;
	ss[5] = 0;
	ss[6] = 0;
	ss[7] = 0;
	if (is_hsw(sna))
		ss[7] |= HSW_SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);

	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, offset=%d\n",
	     bind, bo->handle, ss[1],
	     format, width, height, pitch, offset));

	return bind * sizeof(uint32_t);
}
1796
static void gen7_emit_video_state(struct sna *sna,
				  const struct sna_composite_op *op)
{
	/* Build the binding table for a video blit: the destination
	 * surface followed by one surface per source plane, then emit
	 * the pipeline state.  Planar formats bind six source surfaces
	 * (Y twice, V twice, U twice at their respective buffer
	 * offsets); packed formats bind a single surface.
	 */
	struct sna_video_frame *frame = op->priv;
	uint32_t src_surf_format[6];
	uint32_t src_surf_base[6];
	int src_width[6];
	int src_height[6];
	int src_pitch[6];
	uint32_t *binding_table;
	uint16_t offset, dirty;
	int n_src, n;

	gen7_get_batch(sna, op);

	src_surf_base[0] = 0;
	src_surf_base[1] = 0;
	src_surf_base[2] = frame->VBufOffset;
	src_surf_base[3] = frame->VBufOffset;
	src_surf_base[4] = frame->UBufOffset;
	src_surf_base[5] = frame->UBufOffset;

	if (is_planar_fourcc(frame->id)) {
		/* Full-resolution luma planes... */
		for (n = 0; n < 2; n++) {
			src_surf_format[n] = GEN7_SURFACEFORMAT_R8_UNORM;
			src_width[n]  = frame->width;
			src_height[n] = frame->height;
			src_pitch[n]  = frame->pitch[1];
		}
		/* ...and half-resolution chroma planes (interleaved for
		 * NV12, separate otherwise). */
		for (; n < 6; n++) {
			if (is_nv12_fourcc(frame->id))
				src_surf_format[n] = GEN7_SURFACEFORMAT_R8G8_UNORM;
			else
				src_surf_format[n] = GEN7_SURFACEFORMAT_R8_UNORM;
			src_width[n] = frame->width / 2;
			src_height[n] = frame->height / 2;
			src_pitch[n] = frame->pitch[0];
		}
		n_src = 6;
	} else {
		/* Packed formats: a single source surface. */
		if (frame->id == FOURCC_RGB888)
			src_surf_format[0] = GEN7_SURFACEFORMAT_B8G8R8X8_UNORM;
		else if (frame->id == FOURCC_UYVY)
			src_surf_format[0] = GEN7_SURFACEFORMAT_YCRCB_SWAPY;
		else
			src_surf_format[0] = GEN7_SURFACEFORMAT_YCRCB_NORMAL;

		src_width[0]  = frame->width;
		src_height[0] = frame->height;
		src_pitch[0]  = frame->pitch[0];
		n_src = 1;
	}

	binding_table = gen7_composite_get_binding_table(sna, &offset);

	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table[0] =
		gen7_bind_bo(sna,
			     op->dst.bo, op->dst.width, op->dst.height,
			     gen7_get_dest_format(op->dst.format),
			     true);
	for (n = 0; n < n_src; n++) {
		binding_table[1+n] =
			gen7_bind_video_source(sna,
					       frame->bo,
					       src_surf_base[n],
					       src_width[n],
					       src_height[n],
					       src_pitch[n],
					       src_surf_format[n]);
	}

	/* Dirty dst flag travels in bit 0 of the table offset. */
	gen7_emit_state(sna, op, offset | dirty);
}
1872
1873static unsigned select_video_kernel(const struct sna_video *video,
1874				    const struct sna_video_frame *frame)
1875{
1876	switch (frame->id) {
1877	case FOURCC_YV12:
1878	case FOURCC_I420:
1879	case FOURCC_XVMC:
1880		return video->colorspace ?
1881			GEN7_WM_KERNEL_VIDEO_PLANAR_BT709 :
1882			GEN7_WM_KERNEL_VIDEO_PLANAR_BT601;
1883
1884	case FOURCC_NV12:
1885		return video->colorspace ?
1886			GEN7_WM_KERNEL_VIDEO_NV12_BT709 :
1887			GEN7_WM_KERNEL_VIDEO_NV12_BT601;
1888
1889	case FOURCC_RGB888:
1890	case FOURCC_RGB565:
1891		return GEN7_WM_KERNEL_VIDEO_RGB;
1892
1893	default:
1894		return video->colorspace ?
1895			GEN7_WM_KERNEL_VIDEO_PACKED_BT709 :
1896			GEN7_WM_KERNEL_VIDEO_PACKED_BT601;
1897	}
1898}
1899
static bool
gen7_render_video(struct sna *sna,
		  struct sna_video *video,
		  struct sna_video_frame *frame,
		  RegionPtr dstRegion,
		  PixmapPtr pixmap)
{
	/* Blit a video frame onto the destination pixmap through the 3D
	 * pipeline, emitting one textured rectangle per clip box of
	 * dstRegion.  Returns false if the batch cannot fit both bos
	 * even after a submit.
	 */
	struct sna_composite_op tmp;
	struct sna_pixmap *priv = sna_pixmap(pixmap);
	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
	int src_width = frame->src.x2 - frame->src.x1;
	int src_height = frame->src.y2 - frame->src.y1;
	float src_offset_x, src_offset_y;
	float src_scale_x, src_scale_y;
	unsigned filter;
	const BoxRec *box;
	int nbox;

	DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
	     __FUNCTION__,
	     src_width, src_height, dst_width, dst_height,
	     region_num_rects(dstRegion),
	     REGION_EXTENTS(NULL, dstRegion)->x1,
	     REGION_EXTENTS(NULL, dstRegion)->y1,
	     REGION_EXTENTS(NULL, dstRegion)->x2,
	     REGION_EXTENTS(NULL, dstRegion)->y2));

	assert(priv->gpu_bo);
	memset(&tmp, 0, sizeof(tmp));

	/* Synthesise a composite op describing the blit. */
	tmp.dst.pixmap = pixmap;
	tmp.dst.width  = pixmap->drawable.width;
	tmp.dst.height = pixmap->drawable.height;
	tmp.dst.format = sna_render_format_for_depth(pixmap->drawable.depth);
	tmp.dst.bo = priv->gpu_bo;

	tmp.src.bo = frame->bo;
	tmp.mask.bo = NULL;

	/* (x, y) plus (u, v) per vertex; three vertices per rectangle. */
	tmp.floats_per_vertex = 3;
	tmp.floats_per_rect = 9;

	/* Only filter when actually scaling. */
	if (src_width == dst_width && src_height == dst_height)
		filter = SAMPLER_FILTER_NEAREST;
	else
		filter = SAMPLER_FILTER_BILINEAR;

	tmp.u.gen7.flags =
		GEN7_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
					      SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
			       NO_BLEND,
			       select_video_kernel(video, frame),
			       2);
	tmp.priv = frame;

	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
			return false;

		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, &tmp);
	gen7_emit_video_state(sna, &tmp);

	DBG(("%s: src=(%d, %d)x(%d, %d); frame=(%dx%d), dst=(%dx%d)\n",
	     __FUNCTION__,
	     frame->src.x1, frame->src.y1,
	     src_width, src_height,
	     dst_width, dst_height,
	     frame->width, frame->height));

	/* Map destination pixels to normalised source texture
	 * coordinates: u = x * scale + offset, similarly for v. */
	src_scale_x = (float)src_width / dst_width / frame->width;
	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;

	src_scale_y = (float)src_height / dst_height / frame->height;
	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;

	DBG(("%s: scale=(%f, %f), offset=(%f, %f)\n",
	     __FUNCTION__,
	     src_scale_x, src_scale_y,
	     src_offset_x, src_offset_y));

	box = region_rects(dstRegion);
	nbox = region_num_rects(dstRegion);
	while (nbox--) {
		DBG(("%s: dst=(%d, %d), (%d, %d); src=(%f, %f), (%f, %f)\n",
		     __FUNCTION__,
		     box->x1, box->y1,
		     box->x2, box->y2,
		     box->x1 * src_scale_x + src_offset_x,
		     box->y1 * src_scale_y + src_offset_y,
		     box->x2 * src_scale_x + src_offset_x,
		     box->y2 * src_scale_y + src_offset_y));

		gen7_get_rectangles(sna, &tmp, 1, gen7_emit_video_state);

		/* RECTLIST: three corners per rectangle, each with its
		 * interpolated texture coordinate. */
		OUT_VERTEX(box->x2, box->y2);
		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

		OUT_VERTEX(box->x1, box->y2);
		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

		OUT_VERTEX(box->x1, box->y1);
		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);

		box++;
	}
	gen4_vertex_flush(sna);

	/* Record the written region as GPU damage. */
	if (!DAMAGE_IS_ALL(priv->gpu_damage))
		sna_damage_add(&priv->gpu_damage, dstRegion);

	return true;
}
2021
static int
gen7_composite_picture(struct sna *sna,
		       PicturePtr picture,
		       struct sna_composite_channel *channel,
		       int x, int y,
		       int w, int h,
		       int dst_x, int dst_y,
		       bool precise)
{
	/* Bind a source/mask picture to a composite channel, choosing the
	 * cheapest representation available: a solid colour, a linear
	 * gradient, a (possibly converted/extracted) pixmap bo, or a
	 * software fixup.  The result convention, as interpreted by the
	 * callers' switch statements, is: 1 success, 0 channel reduced to
	 * nothing (caller substitutes solid), -1 failure/fallback.
	 */
	PixmapPtr pixmap;
	uint32_t color;
	int16_t dx, dy;

	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
	     __FUNCTION__, x, y, w, h, dst_x, dst_y));

	channel->is_solid = false;
	channel->card_format = -1;

	/* Constant-colour pictures bypass sampling entirely. */
	if (sna_picture_is_solid(picture, &color))
		return gen4_channel_init_solid(sna, channel, color);

	if (picture->pDrawable == NULL) {
		int ret;

		/* Drawable-less pictures are gradients/solids; linear
		 * gradients have a dedicated shader path. */
		if (picture->pSourcePict->type == SourcePictTypeLinear)
			return gen4_channel_init_linear(sna, picture, channel,
							x, y,
							w, h,
							dst_x, dst_y);

		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
		ret = -1;
		/* For imprecise rendering a cheap approximation of the
		 * gradient suffices; otherwise rasterise it exactly. */
		if (!precise)
			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
								      x, y, w, h, dst_x, dst_y);
		if (ret == -1)
			ret = sna_render_picture_fixup(sna, picture, channel,
						       x, y, w, h, dst_x, dst_y);
		return ret;
	}

	/* Alpha maps, unsupported repeat modes and unsupported filters
	 * all require a software fixup pass. */
	if (picture->alphaMap) {
		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);
	}

	if (!gen7_check_repeat(picture))
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);

	if (!gen7_check_filter(picture))
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);

	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
	channel->filter = picture->filter;

	assert(picture->pDrawable);
	pixmap = get_drawable_pixmap(picture->pDrawable);
	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);

	x += dx + picture->pDrawable->x;
	y += dy + picture->pDrawable->y;

	channel->is_affine = sna_transform_is_affine(picture->transform);
	/* NB: dx/dy are reused here — on success they now hold the
	 * integer translation extracted from the transform. */
	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
		DBG(("%s: integer translation (%d, %d), removing\n",
		     __FUNCTION__, dx, dy));
		x += dx;
		y += dy;
		channel->transform = NULL;
		channel->filter = PictFilterNearest;

		/* If the sampled region lies wholly inside the pixmap (or
		 * repeats), a cleared pixmap can be reduced to a solid. */
		if (channel->repeat ||
		    (x >= 0 &&
		     y >= 0 &&
		     x + w <= pixmap->drawable.width &&
		     y + h <= pixmap->drawable.height)) {
			struct sna_pixmap *priv = sna_pixmap(pixmap);
			if (priv && priv->clear) {
				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
				return gen4_channel_init_solid(sna, channel,
							       solid_color(picture->format, priv->clear_color));
			}
		}
	} else
		channel->transform = picture->transform;

	channel->pict_format = picture->format;
	channel->card_format = gen7_get_card_format(picture->format);
	/* No direct hardware format: convert the pixmap data. */
	if (channel->card_format == (unsigned)-1)
		return sna_render_picture_convert(sna, picture, channel, pixmap,
						  x, y, w, h, dst_x, dst_y,
						  false);

	/* Too big to sample directly: extract just the region of interest. */
	if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
		DBG(("%s: extracting from pixmap %dx%d\n", __FUNCTION__,
		     pixmap->drawable.width, pixmap->drawable.height));
		return sna_render_picture_extract(sna, picture, channel,
						  x, y, w, h, dst_x, dst_y);
	}

	DBG(("%s: pixmap, repeat=%d, filter=%d, transform?=%d [affine? %d], format=%08x\n",
	     __FUNCTION__,
	     channel->repeat, channel->filter,
	     channel->transform != NULL, channel->is_affine,
	     channel->pict_format));
	if (channel->transform) {
		DBG(("%s: transform=[%f %f %f, %f %f %f, %f %f %f]\n",
		     __FUNCTION__,
		     channel->transform->matrix[0][0] / 65536.,
		     channel->transform->matrix[0][1] / 65536.,
		     channel->transform->matrix[0][2] / 65536.,
		     channel->transform->matrix[1][0] / 65536.,
		     channel->transform->matrix[1][1] / 65536.,
		     channel->transform->matrix[1][2] / 65536.,
		     channel->transform->matrix[2][0] / 65536.,
		     channel->transform->matrix[2][1] / 65536.,
		     channel->transform->matrix[2][2] / 65536.));
	}

	return sna_render_pixmap_bo(sna, channel, pixmap,
				    x, y, w, h, dst_x, dst_y);
}
2148
2149inline static void gen7_composite_channel_convert(struct sna_composite_channel *channel)
2150{
2151	channel->repeat = gen7_repeat(channel->repeat);
2152	channel->filter = gen7_filter(channel->filter);
2153	if (channel->card_format == (unsigned)-1)
2154		channel->card_format = gen7_get_card_format(channel->pict_format);
2155	assert(channel->card_format != (unsigned)-1);
2156}
2157
2158static void gen7_render_composite_done(struct sna *sna,
2159				       const struct sna_composite_op *op)
2160{
2161	if (sna->render.vertex_offset) {
2162		gen4_vertex_flush(sna);
2163		gen7_magic_ca_pass(sna, op);
2164	}
2165
2166	if (op->mask.bo)
2167		kgem_bo_destroy(&sna->kgem, op->mask.bo);
2168	if (op->src.bo)
2169		kgem_bo_destroy(&sna->kgem, op->src.bo);
2170
2171	sna_render_composite_redirect_done(sna, op);
2172}
2173
inline static bool
gen7_composite_set_target(struct sna *sna,
			  struct sna_composite_op *op,
			  PicturePtr dst,
			  int x, int y, int w, int h,
			  bool partial)
{
	/* Select the destination bo for a composite operation and record
	 * its geometry/deltas in op->dst.  Returns false when no usable
	 * GPU target exists (caller falls back).
	 */
	BoxRec box;
	unsigned int hint;

	DBG(("%s: (%d, %d)x(%d, %d), partial?=%d\n", __FUNCTION__, x, y, w, h, partial));

	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
	op->dst.format = dst->format;
	op->dst.width  = op->dst.pixmap->drawable.width;
	op->dst.height = op->dst.pixmap->drawable.height;

	/* Either the caller supplied an explicit extent, or we take the
	 * picture's clip extents (w == h == 0). */
	if (w | h) {
		assert(w && h);
		box.x1 = x;
		box.y1 = y;
		box.x2 = x + w;
		box.y2 = y + h;
	} else
		sna_render_picture_extents(dst, &box);

	hint = PREFER_GPU | RENDER_GPU;
	if (!need_tiling(sna, op->dst.width, op->dst.height))
		hint |= FORCE_GPU;
	/* A non-partial op overwrites its extent, so existing damage can
	 * be ignored; covering the whole pixmap replaces it outright. */
	if (!partial) {
		hint |= IGNORE_DAMAGE;
		if (w == op->dst.width && h == op->dst.height)
			hint |= REPLACES;
	}

	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
	if (op->dst.bo == NULL)
		return false;

	if (hint & REPLACES) {
		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
		/* The whole surface is replaced: cancel any pending
		 * writes to its buffers. */
		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
	}

	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
			    &op->dst.x, &op->dst.y);

	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
	     __FUNCTION__,
	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
	     op->dst.width, op->dst.height,
	     op->dst.bo->pitch,
	     op->dst.x, op->dst.y,
	     op->damage ? *op->damage : (void *)-1));

	assert(op->dst.bo->proxy == NULL);

	/* Oversized targets are handled by redirecting rendering into a
	 * temporary proxy covering just the operation extents. */
	if (too_large(op->dst.width, op->dst.height) &&
	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
		return false;

	return true;
}
2237
static bool
try_blt(struct sna *sna,
	uint8_t op,
	PicturePtr src,
	PicturePtr mask,
	PicturePtr dst,
	int16_t src_x, int16_t src_y,
	int16_t msk_x, int16_t msk_y,
	int16_t dst_x, int16_t dst_y,
	int16_t width, int16_t height,
	unsigned flags,
	struct sna_composite_op *tmp)
{
	/* Heuristically decide whether this (maskless) composite should be
	 * attempted on the BLT engine before trying the 3D pipe.  Falls
	 * through to sna_blt_composite(); "upload" additionally tags the
	 * operation so the source may be pulled through the CPU.
	 */
	struct kgem_bo *bo;

	/* Stay on the engine we are already batching for. */
	if (sna->kgem.mode == KGEM_BLT) {
		DBG(("%s: already performing BLT\n", __FUNCTION__));
		goto execute;
	}

	/* The 3D pipe cannot handle it at all: BLT is the only option. */
	if (too_large(width, height)) {
		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
		     __FUNCTION__, width, height));
		goto execute;
	}

	bo = __sna_drawable_peek_bo(dst->pDrawable);
	if (bo == NULL)
		goto execute;

	if (untiled_tlb_miss(bo))
		goto execute;

	/* If the destination is busy, only use the BLT if that is the
	 * ring it is already active on; otherwise keep it on render. */
	if (bo->rq) {
		if (RQ_IS_BLT(bo->rq))
			goto execute;

		return false;
	}

	/* The BLT engine cannot write Y-tiled surfaces. */
	if (bo->tiling == I915_TILING_Y)
		goto upload;

	/* Self-copies and solid fills are cheap on the blitter. */
	if (src->pDrawable == dst->pDrawable &&
	    (sna->render_state.gt < 3 || width*height < 1024) &&
	    can_switch_to_blt(sna, bo, 0))
		goto execute;

	if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, bo, 0))
		goto execute;

	if (src->pDrawable) {
		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
		if (s == NULL)
			goto upload;

		if (prefer_blt_bo(sna, s, bo))
			goto execute;
	}

	if (sna->kgem.ring == KGEM_BLT) {
		DBG(("%s: already performing BLT\n", __FUNCTION__));
		goto execute;
	}

upload:
	flags |= COMPOSITE_UPLOAD;
execute:
	return sna_blt_composite(sna, op,
				 src, dst,
				 src_x, src_y,
				 dst_x, dst_y,
				 width, height,
				 flags, tmp);
}
2313
2314static bool
2315check_gradient(PicturePtr picture, bool precise)
2316{
2317	if (picture->pDrawable)
2318		return false;
2319
2320	switch (picture->pSourcePict->type) {
2321	case SourcePictTypeSolidFill:
2322	case SourcePictTypeLinear:
2323		return false;
2324	default:
2325		return precise;
2326	}
2327}
2328
2329static bool
2330has_alphamap(PicturePtr p)
2331{
2332	return p->alphaMap != NULL;
2333}
2334
2335static bool
2336need_upload(PicturePtr p)
2337{
2338	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
2339}
2340
2341static bool
2342source_is_busy(PixmapPtr pixmap)
2343{
2344	struct sna_pixmap *priv = sna_pixmap(pixmap);
2345	if (priv == NULL || priv->clear)
2346		return false;
2347
2348	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
2349		return true;
2350
2351	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
2352		return true;
2353
2354	return priv->gpu_damage && !priv->cpu_damage;
2355}
2356
2357static bool
2358source_fallback(PicturePtr p, PixmapPtr pixmap, bool precise)
2359{
2360	if (sna_picture_is_solid(p, NULL))
2361		return false;
2362
2363	if (p->pSourcePict)
2364		return check_gradient(p, precise);
2365
2366	if (!gen7_check_repeat(p) || !gen7_check_format(p->format))
2367		return true;
2368
2369	if (pixmap && source_is_busy(pixmap))
2370		return false;
2371
2372	return has_alphamap(p) || !gen7_check_filter(p) || need_upload(p);
2373}
2374
static bool
gen7_composite_fallback(struct sna *sna,
			PicturePtr src,
			PicturePtr mask,
			PicturePtr dst)
{
	/* Decide whether the whole composite should be performed on the
	 * CPU.  Returns true to fall back, false to proceed on the GPU.
	 */
	PixmapPtr src_pixmap;
	PixmapPtr mask_pixmap;
	PixmapPtr dst_pixmap;
	bool src_fallback, mask_fallback;

	if (!gen7_check_dst_format(dst->format)) {
		DBG(("%s: unknown destination format: %d\n",
		     __FUNCTION__, dst->format));
		return true;
	}

	dst_pixmap = get_drawable_pixmap(dst->pDrawable);

	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
	src_fallback = source_fallback(src, src_pixmap,
				       dst->polyMode == PolyModePrecise);

	if (mask) {
		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
		mask_fallback = source_fallback(mask, mask_pixmap,
						dst->polyMode == PolyModePrecise);
	} else {
		mask_pixmap = NULL;
		mask_fallback = false;
	}

	/* If we are using the destination as a source and need to
	 * readback in order to upload the source, do it all
	 * on the cpu.
	 */
	if (src_pixmap == dst_pixmap && src_fallback) {
		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
		return true;
	}
	if (mask_pixmap == dst_pixmap && mask_fallback) {
		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
		return true;
	}

	/* If anything is on the GPU, push everything out to the GPU */
	if (dst_use_gpu(dst_pixmap)) {
		DBG(("%s: dst is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	if (src_pixmap && !src_fallback) {
		DBG(("%s: src is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}
	if (mask_pixmap && !mask_fallback) {
		DBG(("%s: mask is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	/* However if the dst is not on the GPU and we need to
	 * render one of the sources using the CPU, we may
	 * as well do the entire operation in place on the CPU.
	 */
	if (src_fallback) {
		DBG(("%s: dst is on the CPU and src will fallback\n",
		     __FUNCTION__));
		return true;
	}

	if (mask && mask_fallback) {
		DBG(("%s: dst is on the CPU and mask will fallback\n",
		     __FUNCTION__));
		return true;
	}

	/* An oversized CPU destination would need tiling/redirection;
	 * not worth migrating it for a single operation. */
	if (too_large(dst_pixmap->drawable.width,
		      dst_pixmap->drawable.height) &&
	    dst_is_cpu(dst_pixmap)) {
		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
		return true;
	}

	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
	     __FUNCTION__));
	return dst_use_cpu(dst_pixmap);
}
2465
static int
reuse_source(struct sna *sna,
	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
{
	/* Try to initialise the mask channel by sharing the already
	 * prepared source channel, avoiding a second picture setup.
	 * Returns true/non-zero when mc has been initialised.
	 */
	uint32_t color;

	/* Sharing only works when both channels sample identically. */
	if (src_x != msk_x || src_y != msk_y)
		return false;

	if (src == mask) {
		DBG(("%s: mask is source\n", __FUNCTION__));
		*mc = *sc;
		/* Both channels are destroyed independently, so take an
		 * extra reference on the shared bo. */
		mc->bo = kgem_bo_reference(mc->bo);
		return true;
	}

	if (sna_picture_is_solid(mask, &color))
		return gen4_channel_init_solid(sna, mc, color);

	if (sc->is_solid)
		return false;

	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
		return false;

	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));

	/* Same drawable, but the sampler state must also match. */
	if (!sna_transform_equal(src->transform, mask->transform))
		return false;

	if (!sna_picture_alphamap_equal(src, mask))
		return false;

	if (!gen7_check_repeat(mask))
		return false;

	if (!gen7_check_filter(mask))
		return false;

	if (!gen7_check_format(mask->format))
		return false;

	DBG(("%s: reusing source channel for mask with a twist\n",
	     __FUNCTION__));

	/* Clone the channel but apply the mask's own sampler settings. */
	*mc = *sc;
	mc->repeat = gen7_repeat(mask->repeat ? mask->repeatType : RepeatNone);
	mc->filter = gen7_filter(mask->filter);
	mc->pict_format = mask->format;
	mc->card_format = gen7_get_card_format(mask->format);
	mc->bo = kgem_bo_reference(mc->bo);
	return true;
}
2520
static bool
gen7_render_composite(struct sna *sna,
		      uint8_t op,
		      PicturePtr src,
		      PicturePtr mask,
		      PicturePtr dst,
		      int16_t src_x, int16_t src_y,
		      int16_t msk_x, int16_t msk_y,
		      int16_t dst_x, int16_t dst_y,
		      int16_t width, int16_t height,
		      unsigned flags,
		      struct sna_composite_op *tmp)
{
	/* Top-level composite entry point for gen7: choose BLT vs 3D,
	 * prepare destination and channels, and install the emitters in
	 * tmp.  Returns false if the operation could not be set up (after
	 * attempting a final BLT fallback).
	 */
	if (op >= ARRAY_SIZE(gen7_blend_op))
		return false;

	DBG(("%s: %dx%d, current mode=%d/%d\n", __FUNCTION__,
	     width, height, sna->kgem.mode, sna->kgem.ring));

	/* Maskless operations may be cheaper on the BLT engine. */
	if (mask == NULL &&
	    try_blt(sna, op,
		    src, mask, dst,
		    src_x, src_y,
		    msk_x, msk_y,
		    dst_x, dst_y,
		    width, height,
		    flags, tmp))
		return true;

	if (gen7_composite_fallback(sna, src, mask, dst))
		goto fallback;

	/* Split operations too large for the 3D pipe into tiles. */
	if (need_tiling(sna, width, height))
		return sna_tiling_composite(op, src, mask, dst,
					    src_x, src_y,
					    msk_x, msk_y,
					    dst_x, dst_y,
					    width, height,
					    tmp);

	/* Clearing with the canonical clear source is a plain Src copy. */
	if (op == PictOpClear && src == sna->clear)
		op = PictOpSrc;
	tmp->op = op;
	if (!gen7_composite_set_target(sna, tmp, dst,
				       dst_x, dst_y, width, height,
				       flags & COMPOSITE_PARTIAL || op > PictOpSrc))
		goto fallback;

	switch (gen7_composite_picture(sna, src, &tmp->src,
				       src_x, src_y,
				       width, height,
				       dst_x, dst_y,
				       dst->polyMode == PolyModePrecise)) {
	case -1:
		goto cleanup_dst;
	case 0:
		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
			goto cleanup_dst;
		/* fall through to fixup */
	case 1:
		/* Did we just switch rings to prepare the source? */
		if (mask == NULL &&
		    prefer_blt_composite(sna, tmp) &&
		    sna_blt_composite__convert(sna,
					       dst_x, dst_y, width, height,
					       tmp))
			return true;

		gen7_composite_channel_convert(&tmp->src);
		break;
	}

	tmp->is_affine = tmp->src.is_affine;
	tmp->has_component_alpha = false;
	tmp->need_magic_ca_pass = false;

	tmp->mask.bo = NULL;
	tmp->mask.filter = SAMPLER_FILTER_NEAREST;
	tmp->mask.repeat = SAMPLER_EXTEND_NONE;

	if (mask) {
		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
			tmp->has_component_alpha = true;

			/* Check if it's component alpha that relies on a source alpha and on
			 * the source value.  We can only get one of those into the single
			 * source value that we get to blend with.
			 */
			if (gen7_blend_op[op].src_alpha &&
			    (gen7_blend_op[op].src_blend != GEN7_BLENDFACTOR_ZERO)) {
				if (op != PictOpOver)
					goto cleanup_src;

				/* Emulate Over with two passes:
				 * OutReverse now, the "magic" Add later. */
				tmp->need_magic_ca_pass = true;
				tmp->op = PictOpOutReverse;
			}
		}

		if (!reuse_source(sna,
				  src, &tmp->src, src_x, src_y,
				  mask, &tmp->mask, msk_x, msk_y)) {
			switch (gen7_composite_picture(sna, mask, &tmp->mask,
						       msk_x, msk_y,
						       width, height,
						       dst_x, dst_y,
						       dst->polyMode == PolyModePrecise)) {
			case -1:
				goto cleanup_src;
			case 0:
				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
					goto cleanup_src;
				/* fall through to fixup */
			case 1:
				gen7_composite_channel_convert(&tmp->mask);
				break;
			}
		}

		tmp->is_affine &= tmp->mask.is_affine;
	}

	/* Pack sampler, blend, kernel and vertex-emitter selections into
	 * the per-op state word. */
	tmp->u.gen7.flags =
		GEN7_SET_FLAGS(SAMPLER_OFFSET(tmp->src.filter,
					      tmp->src.repeat,
					      tmp->mask.filter,
					      tmp->mask.repeat),
			       gen7_get_blend(tmp->op,
					      tmp->has_component_alpha,
					      tmp->dst.format),
			       gen7_choose_composite_kernel(tmp->op,
							    tmp->mask.bo != NULL,
							    tmp->has_component_alpha,
							    tmp->is_affine),
			       gen4_choose_composite_emitter(sna, tmp));

	tmp->blt   = gen7_render_composite_blt;
	tmp->box   = gen7_render_composite_box;
	tmp->boxes = gen7_render_composite_boxes__blt;
	if (tmp->emit_boxes){
		tmp->boxes = gen7_render_composite_boxes;
		tmp->thread_boxes = gen7_render_composite_boxes__thread;
	}
	tmp->done  = gen7_render_composite_done;

	/* Make sure all three bo fit in the current batch, flushing it
	 * once if necessary. */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
	if (!kgem_check_bo(&sna->kgem,
			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
			   NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem,
				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
				   NULL))
			goto cleanup_mask;
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, tmp);
	gen7_emit_composite_state(sna, tmp);
	return true;

cleanup_mask:
	if (tmp->mask.bo) {
		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
		tmp->mask.bo = NULL;
	}
cleanup_src:
	if (tmp->src.bo) {
		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
		tmp->src.bo = NULL;
	}
cleanup_dst:
	if (tmp->redirect.real_bo) {
		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
		tmp->redirect.real_bo = NULL;
	}
fallback:
	return (mask == NULL &&
		sna_blt_composite(sna, op,
				  src, dst,
				  src_x, src_y,
				  dst_x, dst_y,
				  width, height,
				  flags | COMPOSITE_FALLBACK, tmp));
}
2705
#if !NO_COMPOSITE_SPANS
fastcall static void
gen7_render_composite_spans_box(struct sna *sna,
				const struct sna_composite_spans_op *op,
				const BoxRec *box, float opacity)
{
	/* Emit a single opacity-weighted rectangle: reserve batch space
	 * for one rectangle, then let the chosen emitter write it. */
	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
	     __FUNCTION__,
	     op->base.src.offset[0], op->base.src.offset[1],
	     opacity,
	     op->base.dst.x, op->base.dst.y,
	     box->x1, box->y1,
	     box->x2 - box->x1,
	     box->y2 - box->y1));

	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_composite_state);
	op->prim_emit(sna, op, box, opacity);
}
2724
2725static void
2726gen7_render_composite_spans_boxes(struct sna *sna,
2727				  const struct sna_composite_spans_op *op,
2728				  const BoxRec *box, int nbox,
2729				  float opacity)
2730{
2731	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
2732	     __FUNCTION__, nbox,
2733	     op->base.src.offset[0], op->base.src.offset[1],
2734	     opacity,
2735	     op->base.dst.x, op->base.dst.y));
2736
2737	do {
2738		int nbox_this_time;
2739
2740		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox,
2741						     gen7_emit_composite_state);
2742		nbox -= nbox_this_time;
2743
2744		do {
2745			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2746			     box->x1, box->y1,
2747			     box->x2 - box->x1,
2748			     box->y2 - box->y1));
2749
2750			op->prim_emit(sna, op, box++, opacity);
2751		} while (--nbox_this_time);
2752	} while (nbox);
2753}
2754
fastcall static void
gen7_render_composite_spans_boxes__thread(struct sna *sna,
					  const struct sna_composite_spans_op *op,
					  const struct sna_opacity_box *box,
					  int nbox)
{
	/* Multi-threaded span emission: vertex space is reserved while
	 * holding the render vertex lock, then filled with the lock
	 * dropped so other threads can reserve their own chunks.
	 */
	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     op->base.dst.x, op->base.dst.y));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox,
						     gen7_emit_composite_state);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		/* Claim the vertex range under the lock... */
		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;

		/* ...pin the buffer, then write it outside the lock. */
		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
2790
2791fastcall static void
2792gen7_render_composite_spans_done(struct sna *sna,
2793				 const struct sna_composite_spans_op *op)
2794{
2795	if (sna->render.vertex_offset)
2796		gen4_vertex_flush(sna);
2797
2798	DBG(("%s()\n", __FUNCTION__));
2799
2800	if (op->base.src.bo)
2801		kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2802
2803	sna_render_composite_redirect_done(sna, &op->base);
2804}
2805
2806static bool
2807gen7_check_composite_spans(struct sna *sna,
2808			   uint8_t op, PicturePtr src, PicturePtr dst,
2809			   int16_t width, int16_t height, unsigned flags)
2810{
2811	if (op >= ARRAY_SIZE(gen7_blend_op))
2812		return false;
2813
2814	if (gen7_composite_fallback(sna, src, NULL, dst))
2815		return false;
2816
2817	if (need_tiling(sna, width, height) &&
2818	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
2819		DBG(("%s: fallback, tiled operation not on GPU\n",
2820		     __FUNCTION__));
2821		return false;
2822	}
2823
2824	return true;
2825}
2826
static bool
gen7_render_composite_spans(struct sna *sna,
			    uint8_t op,
			    PicturePtr src,
			    PicturePtr dst,
			    int16_t src_x,  int16_t src_y,
			    int16_t dst_x,  int16_t dst_y,
			    int16_t width,  int16_t height,
			    unsigned flags,
			    struct sna_composite_spans_op *tmp)
{
	/* Set up a spans (per-box opacity) composite: prepare destination
	 * and source, select the opacity kernel and emitters, and make
	 * room in the batch.  Returns false on failure with all acquired
	 * resources released.
	 */
	DBG(("%s: %dx%d with flags=%x, current mode=%d/%d\n", __FUNCTION__,
	     width, height, flags, sna->kgem.mode, sna->kgem.ring));

	assert(gen7_check_composite_spans(sna, op, src, dst, width, height, flags));

	/* Too wide for the 3D pipe: split into tiles. */
	if (need_tiling(sna, width, height)) {
		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
		     __FUNCTION__, width, height));
		return sna_tiling_composite_spans(op, src, dst,
						  src_x, src_y, dst_x, dst_y,
						  width, height, flags, tmp);
	}

	tmp->base.op = op;
	if (!gen7_composite_set_target(sna, &tmp->base, dst,
				       dst_x, dst_y, width, height, true))
		return false;

	switch (gen7_composite_picture(sna, src, &tmp->base.src,
				       src_x, src_y,
				       width, height,
				       dst_x, dst_y,
				       dst->polyMode == PolyModePrecise)) {
	case -1:
		goto cleanup_dst;
	case 0:
		/* Source reduced to nothing: substitute solid black. */
		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
			goto cleanup_dst;
		/* fall through to fixup */
	case 1:
		gen7_composite_channel_convert(&tmp->base.src);
		break;
	}
	tmp->base.mask.bo = NULL;

	tmp->base.is_affine = tmp->base.src.is_affine;
	tmp->base.need_magic_ca_pass = false;

	/* The opacity kernel supplies the per-box alpha in place of a
	 * mask channel. */
	tmp->base.u.gen7.flags =
		GEN7_SET_FLAGS(SAMPLER_OFFSET(tmp->base.src.filter,
					      tmp->base.src.repeat,
					      SAMPLER_FILTER_NEAREST,
					      SAMPLER_EXTEND_PAD),
			       gen7_get_blend(tmp->base.op, false, tmp->base.dst.format),
			       GEN7_WM_KERNEL_OPACITY | !tmp->base.is_affine,
			       gen4_choose_spans_emitter(sna, tmp));

	tmp->box   = gen7_render_composite_spans_box;
	tmp->boxes = gen7_render_composite_spans_boxes;
	if (tmp->emit_boxes)
		tmp->thread_boxes = gen7_render_composite_spans_boxes__thread;
	tmp->done  = gen7_render_composite_spans_done;

	/* Ensure both bo fit in the current batch, flushing once. */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
	if (!kgem_check_bo(&sna->kgem,
			   tmp->base.dst.bo, tmp->base.src.bo,
			   NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem,
				   tmp->base.dst.bo, tmp->base.src.bo,
				   NULL))
			goto cleanup_src;
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, &tmp->base);
	gen7_emit_composite_state(sna, &tmp->base);
	return true;

cleanup_src:
	if (tmp->base.src.bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
cleanup_dst:
	if (tmp->base.redirect.real_bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
	return false;
}
2915#endif
2916
static void
gen7_emit_copy_state(struct sna *sna,
		     const struct sna_composite_op *op)
{
	/* Emit the surface/binding state for a copy: destination as a
	 * render target, source as a sampled texture.  Reuses the
	 * previous binding table when it is byte-identical.
	 */
	uint32_t *binding_table;
	uint16_t offset, dirty;

	gen7_get_batch(sna, op);

	binding_table = gen7_composite_get_binding_table(sna, &offset);

	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table[0] =
		gen7_bind_bo(sna,
			     op->dst.bo, op->dst.width, op->dst.height,
			     gen7_get_dest_format(op->dst.format),
			     true);
	binding_table[1] =
		gen7_bind_bo(sna,
			     op->src.bo, op->src.width, op->src.height,
			     op->src.card_format,
			     false);

	/* If this table matches the one already in the batch, discard the
	 * freshly written copy and point back at the old table. */
	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen7.surface_table) == *(uint64_t*)binding_table) {
		sna->kgem.surface += sizeof(struct gen7_surface_state) / sizeof(uint32_t);
		offset = sna->render_state.gen7.surface_table;
	}

	/* Same destination surface as last time: no flush needed. */
	if (sna->kgem.batch[sna->render_state.gen7.surface_table] == binding_table[0])
		dirty = 0;

	assert(!GEN7_READS_DST(op->u.gen7.flags));
	gen7_emit_state(sna, op, offset | dirty);
}
2953
2954static inline bool
2955prefer_blt_copy(struct sna *sna,
2956		struct kgem_bo *src_bo,
2957		struct kgem_bo *dst_bo,
2958		unsigned flags)
2959{
2960	if (sna->kgem.mode == KGEM_BLT)
2961		return true;
2962
2963	assert((flags & COPY_SYNC) == 0);
2964
2965	if (untiled_tlb_miss(src_bo) ||
2966	    untiled_tlb_miss(dst_bo))
2967		return true;
2968
2969	if (flags & COPY_DRI && !sna->kgem.has_semaphores)
2970		return false;
2971
2972	if (force_blt_ring(sna, dst_bo))
2973		return true;
2974
2975	if ((flags & COPY_SMALL ||
2976	     (sna->render_state.gt < 3 && src_bo == dst_bo)) &&
2977            can_switch_to_blt(sna, dst_bo, flags))
2978		return true;
2979
2980	if (kgem_bo_is_render(dst_bo) ||
2981	    kgem_bo_is_render(src_bo))
2982		return false;
2983
2984	if (flags & COPY_LAST &&
2985	    sna->render_state.gt < 3 &&
2986            can_switch_to_blt(sna, dst_bo, flags))
2987		return true;
2988
2989	if (prefer_render_ring(sna, dst_bo))
2990		return false;
2991
2992	if (!prefer_blt_ring(sna, dst_bo, flags))
2993		return false;
2994
2995	return prefer_blt_bo(sna, src_bo, dst_bo);
2996}
2997
2998static bool
2999gen7_render_copy_boxes(struct sna *sna, uint8_t alu,
3000		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
3001		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
3002		       const BoxRec *box, int n, unsigned flags)
3003{
3004	struct sna_composite_op tmp;
3005	BoxRec extents;
3006
3007	DBG(("%s (%d, %d)->(%d, %d) x %d, alu=%x, flags=%x, self-copy=%d, overlaps? %d\n",
3008	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu, flags,
3009	     src_bo == dst_bo,
3010	     overlaps(sna,
3011		      src_bo, src_dx, src_dy,
3012		      dst_bo, dst_dx, dst_dy,
3013		      box, n, flags, &extents)));
3014
3015	if (prefer_blt_copy(sna, src_bo, dst_bo, flags) &&
3016	    sna_blt_compare_depth(src, dst) &&
3017	    sna_blt_copy_boxes(sna, alu,
3018			       src_bo, src_dx, src_dy,
3019			       dst_bo, dst_dx, dst_dy,
3020			       dst->bitsPerPixel,
3021			       box, n))
3022		return true;
3023
3024	if (!(alu == GXcopy || alu == GXclear)) {
3025fallback_blt:
3026		DBG(("%s: fallback blt\n", __FUNCTION__));
3027		if (!sna_blt_compare_depth(src, dst))
3028			return false;
3029
3030		return sna_blt_copy_boxes_fallback(sna, alu,
3031						   src, src_bo, src_dx, src_dy,
3032						   dst, dst_bo, dst_dx, dst_dy,
3033						   box, n);
3034	}
3035
3036	if (overlaps(sna,
3037		     src_bo, src_dx, src_dy,
3038		     dst_bo, dst_dx, dst_dy,
3039		     box, n, flags,
3040		     &extents)) {
3041		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
3042
3043		if ((big || !prefer_render_ring(sna, dst_bo)) &&
3044		    sna_blt_copy_boxes(sna, alu,
3045				       src_bo, src_dx, src_dy,
3046				       dst_bo, dst_dx, dst_dy,
3047				       dst->bitsPerPixel,
3048				       box, n))
3049			return true;
3050
3051		if (big)
3052			goto fallback_blt;
3053
3054		assert(src_bo == dst_bo);
3055		assert(src->depth == dst->depth);
3056		assert(src->width == dst->width);
3057		assert(src->height == dst->height);
3058		return sna_render_copy_boxes__overlap(sna, alu, dst, dst_bo,
3059						      src_dx, src_dy,
3060						      dst_dx, dst_dy,
3061						      box, n, &extents);
3062	}
3063
3064	if (dst->depth == src->depth) {
3065		tmp.dst.format = sna_render_format_for_depth(dst->depth);
3066		tmp.src.pict_format = tmp.dst.format;
3067	} else {
3068		tmp.dst.format = sna_format_for_depth(dst->depth);
3069		tmp.src.pict_format = sna_format_for_depth(src->depth);
3070	}
3071	if (!gen7_check_format(tmp.src.pict_format))
3072		goto fallback_blt;
3073
3074	tmp.dst.pixmap = (PixmapPtr)dst;
3075	tmp.dst.width  = dst->width;
3076	tmp.dst.height = dst->height;
3077	tmp.dst.bo = dst_bo;
3078	tmp.dst.x = tmp.dst.y = 0;
3079	tmp.damage = NULL;
3080
3081	sna_render_composite_redirect_init(&tmp);
3082	if (too_large(tmp.dst.width, tmp.dst.height)) {
3083		int i;
3084
3085		extents = box[0];
3086		for (i = 1; i < n; i++) {
3087			if (box[i].x1 < extents.x1)
3088				extents.x1 = box[i].x1;
3089			if (box[i].y1 < extents.y1)
3090				extents.y1 = box[i].y1;
3091
3092			if (box[i].x2 > extents.x2)
3093				extents.x2 = box[i].x2;
3094			if (box[i].y2 > extents.y2)
3095				extents.y2 = box[i].y2;
3096		}
3097
3098		if (!sna_render_composite_redirect(sna, &tmp,
3099						   extents.x1 + dst_dx,
3100						   extents.y1 + dst_dy,
3101						   extents.x2 - extents.x1,
3102						   extents.y2 - extents.y1,
3103						   n > 1))
3104			goto fallback_tiled;
3105	}
3106
3107	tmp.src.card_format = gen7_get_card_format(tmp.src.pict_format);
3108	if (too_large(src->width, src->height)) {
3109		int i;
3110
3111		extents = box[0];
3112		for (i = 1; i < n; i++) {
3113			if (box[i].x1 < extents.x1)
3114				extents.x1 = box[i].x1;
3115			if (box[i].y1 < extents.y1)
3116				extents.y1 = box[i].y1;
3117
3118			if (box[i].x2 > extents.x2)
3119				extents.x2 = box[i].x2;
3120			if (box[i].y2 > extents.y2)
3121				extents.y2 = box[i].y2;
3122		}
3123
3124		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
3125					       extents.x1 + src_dx,
3126					       extents.y1 + src_dy,
3127					       extents.x2 - extents.x1,
3128					       extents.y2 - extents.y1))
3129			goto fallback_tiled_dst;
3130	} else {
3131		tmp.src.bo = src_bo;
3132		tmp.src.width  = src->width;
3133		tmp.src.height = src->height;
3134		tmp.src.offset[0] = tmp.src.offset[1] = 0;
3135	}
3136
3137	tmp.mask.bo = NULL;
3138
3139	tmp.floats_per_vertex = 2;
3140	tmp.floats_per_rect = 6;
3141	tmp.need_magic_ca_pass = 0;
3142
3143	tmp.u.gen7.flags = COPY_FLAGS(alu);
3144
3145	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
3146	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
3147		kgem_submit(&sna->kgem);
3148		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
3149			if (tmp.src.bo != src_bo)
3150				kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3151			if (tmp.redirect.real_bo)
3152				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
3153			goto fallback_blt;
3154		}
3155		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3156	}
3157
3158	src_dx += tmp.src.offset[0];
3159	src_dy += tmp.src.offset[1];
3160
3161	dst_dx += tmp.dst.x;
3162	dst_dy += tmp.dst.y;
3163
3164	tmp.dst.x = tmp.dst.y = 0;
3165
3166	gen7_align_vertex(sna, &tmp);
3167	gen7_emit_copy_state(sna, &tmp);
3168
3169	do {
3170		int16_t *v;
3171		int n_this_time;
3172
3173		n_this_time = gen7_get_rectangles(sna, &tmp, n,
3174						  gen7_emit_copy_state);
3175		n -= n_this_time;
3176
3177		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
3178		sna->render.vertex_used += 6 * n_this_time;
3179		assert(sna->render.vertex_used <= sna->render.vertex_size);
3180		do {
3181
3182			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
3183			     box->x1 + src_dx, box->y1 + src_dy,
3184			     box->x1 + dst_dx, box->y1 + dst_dy,
3185			     box->x2 - box->x1, box->y2 - box->y1));
3186			v[0] = box->x2 + dst_dx;
3187			v[2] = box->x2 + src_dx;
3188			v[1]  = v[5] = box->y2 + dst_dy;
3189			v[3]  = v[7] = box->y2 + src_dy;
3190			v[8]  = v[4] = box->x1 + dst_dx;
3191			v[10] = v[6] = box->x1 + src_dx;
3192			v[9]  = box->y1 + dst_dy;
3193			v[11] = box->y1 + src_dy;
3194			v += 12; box++;
3195		} while (--n_this_time);
3196	} while (n);
3197
3198	gen4_vertex_flush(sna);
3199	sna_render_composite_redirect_done(sna, &tmp);
3200	if (tmp.src.bo != src_bo)
3201		kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3202	return true;
3203
3204fallback_tiled_dst:
3205	if (tmp.redirect.real_bo)
3206		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
3207fallback_tiled:
3208	DBG(("%s: fallback tiled\n", __FUNCTION__));
3209	if (sna_blt_compare_depth(src, dst) &&
3210	    sna_blt_copy_boxes(sna, alu,
3211			       src_bo, src_dx, src_dy,
3212			       dst_bo, dst_dx, dst_dy,
3213			       dst->bitsPerPixel,
3214			       box, n))
3215		return true;
3216
3217	return sna_tiling_copy_boxes(sna, alu,
3218				     src, src_bo, src_dx, src_dy,
3219				     dst, dst_bo, dst_dx, dst_dy,
3220				     box, n);
3221}
3222
3223static void
3224gen7_render_copy_blt(struct sna *sna,
3225		     const struct sna_copy_op *op,
3226		     int16_t sx, int16_t sy,
3227		     int16_t w,  int16_t h,
3228		     int16_t dx, int16_t dy)
3229{
3230	int16_t *v;
3231
3232	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_copy_state);
3233
3234	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3235	sna->render.vertex_used += 6;
3236	assert(sna->render.vertex_used <= sna->render.vertex_size);
3237
3238	v[0]  = dx+w; v[1]  = dy+h;
3239	v[2]  = sx+w; v[3]  = sy+h;
3240	v[4]  = dx;   v[5]  = dy+h;
3241	v[6]  = sx;   v[7]  = sy+h;
3242	v[8]  = dx;   v[9]  = dy;
3243	v[10] = sx;   v[11] = sy;
3244}
3245
3246static void
3247gen7_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
3248{
3249	if (sna->render.vertex_offset)
3250		gen4_vertex_flush(sna);
3251}
3252
static bool
gen7_render_copy(struct sna *sna, uint8_t alu,
		 PixmapPtr src, struct kgem_bo *src_bo,
		 PixmapPtr dst, struct kgem_bo *dst_bo,
		 struct sna_copy_op *op)
{
	/* Prepare a reusable copy operation: fills in *op so the caller
	 * can stream rectangles through op->blt() and finish with
	 * op->done().  Returns false only when neither RENDER nor the
	 * BLT fallback can be set up for this src/dst pair.
	 */
	DBG(("%s (alu=%d, src=(%dx%d), dst=(%dx%d))\n",
	     __FUNCTION__, alu,
	     src->drawable.width, src->drawable.height,
	     dst->drawable.width, dst->drawable.height));

	/* Use the BLT ring directly when it is preferred and the depths
	 * are compatible for a plain blit. */
	if (prefer_blt_copy(sna, src_bo, dst_bo, 0) &&
	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
	    sna_blt_copy(sna, alu,
			 src_bo, dst_bo,
			 dst->drawable.bitsPerPixel,
			 op))
		return true;

	/* The render copy handles only GXcopy/GXclear, requires distinct
	 * source and destination bo, and is bound by the 3D size limit. */
	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
	    too_large(src->drawable.width, src->drawable.height) ||
	    too_large(dst->drawable.width, dst->drawable.height)) {
fallback:
		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
			return false;

		return sna_blt_copy(sna, alu, src_bo, dst_bo,
				    dst->drawable.bitsPerPixel,
				    op);
	}

	/* Pick matching render formats; equal depths share one format. */
	if (dst->drawable.depth == src->drawable.depth) {
		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
		op->base.src.pict_format = op->base.dst.format;
	} else {
		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
	}
	if (!gen7_check_format(op->base.src.pict_format))
		goto fallback;

	op->base.dst.pixmap = dst;
	op->base.dst.width  = dst->drawable.width;
	op->base.dst.height = dst->drawable.height;
	op->base.dst.bo = dst_bo;

	op->base.src.bo = src_bo;
	op->base.src.card_format =
		gen7_get_card_format(op->base.src.pict_format);
	op->base.src.width  = src->drawable.width;
	op->base.src.height = src->drawable.height;

	op->base.mask.bo = NULL;

	/* Vertices carry (x, y) pairs only: 2 elements per vertex,
	 * 3 vertices (6 elements) per rectangle. */
	op->base.floats_per_vertex = 2;
	op->base.floats_per_rect = 6;

	op->base.u.gen7.flags = COPY_FLAGS(alu);

	/* Reserve batch space; if even a fresh batch cannot accommodate
	 * both bo, fall back to the BLT path. */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
			goto fallback;
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, &op->base);
	gen7_emit_copy_state(sna, &op->base);

	op->blt  = gen7_render_copy_blt;
	op->done = gen7_render_copy_done;
	return true;
}
3327
static void
gen7_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
{
	/* Bind the destination surface and the 1x1 solid-colour source,
	 * then emit whatever pipeline state has changed for this fill.
	 */
	uint16_t dirty;
	uint32_t *binding_table;
	uint16_t offset;

	/* XXX Render Target Fast Clear
	 * Set RTFC Enable in PS and render a rectangle.
	 * Limited to a clearing the full MSC surface only with a
	 * specific kernel.
	 */

	gen7_get_batch(sna, op);

	binding_table = gen7_composite_get_binding_table(sna, &offset);

	/* Snapshot dirtiness before gen7_bind_bo marks the target
	 * as written. */
	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table[0] =
		gen7_bind_bo(sna,
			     op->dst.bo, op->dst.width, op->dst.height,
			     gen7_get_dest_format(op->dst.format),
			     true);
	binding_table[1] =
		gen7_bind_bo(sna,
			     op->src.bo, 1, 1,
			     GEN7_SURFACEFORMAT_B8G8R8A8_UNORM,
			     false);

	/* If the freshly written two-entry table is identical to the
	 * previous one, reclaim the batch space and reuse the old table. */
	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen7.surface_table) == *(uint64_t*)binding_table) {
		sna->kgem.surface +=
			sizeof(struct gen7_surface_state)/sizeof(uint32_t);
		offset = sna->render_state.gen7.surface_table;
	}

	/* Rendering to the same target again needs no flush for the
	 * dirty transition. */
	if (sna->kgem.batch[sna->render_state.gen7.surface_table] == binding_table[0])
		dirty = 0;

	gen7_emit_state(sna, op, offset | dirty);
}
3370
static bool
gen7_render_fill_boxes(struct sna *sna,
		       CARD8 op,
		       PictFormat format,
		       const xRenderColor *color,
		       const DrawableRec *dst, struct kgem_bo *dst_bo,
		       const BoxRec *box, int n)
{
	/* Fill n boxes with a solid colour under the Render operator op,
	 * preferring the BLT ring for the simple cases and redirecting
	 * or tiling when the destination exceeds the 3D size limit.
	 * Returns false when no path can handle the request.
	 */
	struct sna_composite_op tmp;
	uint32_t pixel;

	DBG(("%s (op=%d, color=(%04x, %04x, %04x, %04x) [%08x])\n",
	     __FUNCTION__, op,
	     color->red, color->green, color->blue, color->alpha, (int)format));

	if (op >= ARRAY_SIZE(gen7_blend_op)) {
		DBG(("%s: fallback due to unhandled blend op: %d\n",
		     __FUNCTION__, op));
		return false;
	}

	/* Try a BLT fill first when the BLT is preferred, or when the
	 * format cannot be used as a render target at all. */
	if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) ||
	    !gen7_check_dst_format(format)) {
		uint8_t alu = GXinvalid;

		/* Only Clear/Src map directly onto a BLT raster op. */
		if (op <= PictOpSrc) {
			pixel = 0;
			if (op == PictOpClear)
				alu = GXclear;
			else if (sna_get_pixel_from_rgba(&pixel,
							 color->red,
							 color->green,
							 color->blue,
							 color->alpha,
							 format))
				alu = GXcopy;
		}

		if (alu != GXinvalid &&
		    sna_blt_fill_boxes(sna, alu,
				       dst_bo, dst->bitsPerPixel,
				       pixel, box, n))
			return true;

		if (!gen7_check_dst_format(format))
			return false;
	}

	/* Reduce the colour to an a8r8g8b8 pixel for the solid cache;
	 * Clear degenerates to Src with a zero pixel. */
	if (op == PictOpClear) {
		pixel = 0;
		op = PictOpSrc;
	} else if (!sna_get_pixel_from_rgba(&pixel,
					    color->red,
					    color->green,
					    color->blue,
					    color->alpha,
					    PICT_a8r8g8b8))
		return false;

	DBG(("%s(%08x x %d [(%d, %d), (%d, %d) ...])\n",
	     __FUNCTION__, pixel, n,
	     box[0].x1, box[0].y1, box[0].x2, box[0].y2));

	tmp.dst.pixmap = (PixmapPtr)dst;
	tmp.dst.width  = dst->width;
	tmp.dst.height = dst->height;
	tmp.dst.format = format;
	tmp.dst.bo = dst_bo;
	tmp.dst.x = tmp.dst.y = 0;
	tmp.damage = NULL;

	sna_render_composite_redirect_init(&tmp);
	/* Oversized targets go through a redirected proxy surface, or
	 * to the tiling code if redirection cannot be arranged. */
	if (too_large(dst->width, dst->height)) {
		BoxRec extents;

		boxes_extents(box, n, &extents);
		if (!sna_render_composite_redirect(sna, &tmp,
						   extents.x1, extents.y1,
						   extents.x2 - extents.x1,
						   extents.y2 - extents.y1,
						   n > 1))
			return sna_tiling_fill_boxes(sna, op, format, color,
						     dst, dst_bo, box, n);
	}

	/* 1x1 solid-colour source bo; destroyed again on every exit
	 * path below. */
	tmp.src.bo = sna_render_get_solid(sna, pixel);
	tmp.mask.bo = NULL;

	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.need_magic_ca_pass = false;

	tmp.u.gen7.flags = FILL_FLAGS(op, format);

	/* Reserve batch space; on failure release everything acquired. */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
			if (tmp.redirect.real_bo)
				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);

			return false;
		}
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, &tmp);
	gen7_emit_fill_state(sna, &tmp);

	/* Emit the boxes in bursts bounded by available vertex space. */
	do {
		int n_this_time;
		int16_t *v;

		n_this_time = gen7_get_rectangles(sna, &tmp, n,
						  gen7_emit_fill_state);
		n -= n_this_time;

		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
		sna->render.vertex_used += 6 * n_this_time;
		assert(sna->render.vertex_used <= sna->render.vertex_size);
		do {
			DBG(("	(%d, %d), (%d, %d)\n",
			     box->x1, box->y1, box->x2, box->y2));

			/* Three corners of (x, y, u, v); the constant
			 * texcoords select the solid colour. */
			v[0] = box->x2;
			v[5] = v[1] = box->y2;
			v[8] = v[4] = box->x1;
			v[9] = box->y1;
			v[2] = v[3]  = v[7]  = 1;
			v[6] = v[10] = v[11] = 0;
			v += 12; box++;
		} while (--n_this_time);
	} while (n);

	gen4_vertex_flush(sna);
	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
	sna_render_composite_redirect_done(sna, &tmp);
	return true;
}
3511
3512static void
3513gen7_render_fill_op_blt(struct sna *sna,
3514			const struct sna_fill_op *op,
3515			int16_t x, int16_t y, int16_t w, int16_t h)
3516{
3517	int16_t *v;
3518
3519	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
3520
3521	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_fill_state);
3522
3523	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3524	sna->render.vertex_used += 6;
3525	assert(sna->render.vertex_used <= sna->render.vertex_size);
3526
3527	v[0] = x+w;
3528	v[4] = v[8] = x;
3529	v[1] = v[5] = y+h;
3530	v[9] = y;
3531
3532	v[2] = v[3]  = v[7]  = 1;
3533	v[6] = v[10] = v[11] = 0;
3534}
3535
3536fastcall static void
3537gen7_render_fill_op_box(struct sna *sna,
3538			const struct sna_fill_op *op,
3539			const BoxRec *box)
3540{
3541	int16_t *v;
3542
3543	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
3544	     box->x1, box->y1, box->x2, box->y2));
3545
3546	gen7_get_rectangles(sna, &op->base, 1, gen7_emit_fill_state);
3547
3548	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3549	sna->render.vertex_used += 6;
3550	assert(sna->render.vertex_used <= sna->render.vertex_size);
3551
3552	v[0] = box->x2;
3553	v[8] = v[4] = box->x1;
3554	v[5] = v[1] = box->y2;
3555	v[9] = box->y1;
3556
3557	v[7] = v[2]  = v[3]  = 1;
3558	v[6] = v[10] = v[11] = 0;
3559}
3560
3561fastcall static void
3562gen7_render_fill_op_boxes(struct sna *sna,
3563			  const struct sna_fill_op *op,
3564			  const BoxRec *box,
3565			  int nbox)
3566{
3567	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
3568	     box->x1, box->y1, box->x2, box->y2, nbox));
3569
3570	do {
3571		int nbox_this_time;
3572		int16_t *v;
3573
3574		nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox,
3575						     gen7_emit_fill_state);
3576		nbox -= nbox_this_time;
3577
3578		v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3579		sna->render.vertex_used += 6 * nbox_this_time;
3580		assert(sna->render.vertex_used <= sna->render.vertex_size);
3581
3582		do {
3583			v[0] = box->x2;
3584			v[8] = v[4] = box->x1;
3585			v[5] = v[1] = box->y2;
3586			v[9] = box->y1;
3587			v[7] = v[2]  = v[3]  = 1;
3588			v[6] = v[10] = v[11] = 0;
3589			box++; v += 12;
3590		} while (--nbox_this_time);
3591	} while (nbox);
3592}
3593
3594static void
3595gen7_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
3596{
3597	if (sna->render.vertex_offset)
3598		gen4_vertex_flush(sna);
3599	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
3600}
3601
static bool
gen7_render_fill(struct sna *sna, uint8_t alu,
		 PixmapPtr dst, struct kgem_bo *dst_bo,
		 uint32_t color, unsigned flags,
		 struct sna_fill_op *op)
{
	/* Prepare a reusable solid-fill operation: on success the caller
	 * streams rectangles via op->blt/box/boxes and finishes with
	 * op->done.  Returns false when no path can be set up.
	 */
	DBG(("%s: (alu=%d, color=%x)\n", __FUNCTION__, alu, color));

	/* Use the BLT ring outright when it is the better choice. */
	if (prefer_blt_fill(sna, dst_bo, flags) &&
	    sna_blt_fill(sna, alu,
			 dst_bo, dst->drawable.bitsPerPixel,
			 color,
			 op))
		return true;

	/* RENDER handles only GXcopy/GXclear and size-limited targets;
	 * otherwise the BLT is the only option. */
	if (!(alu == GXcopy || alu == GXclear) ||
	    too_large(dst->drawable.width, dst->drawable.height))
		return sna_blt_fill(sna, alu,
				    dst_bo, dst->drawable.bitsPerPixel,
				    color,
				    op);

	if (alu == GXclear)
		color = 0;

	op->base.dst.pixmap = dst;
	op->base.dst.width  = dst->drawable.width;
	op->base.dst.height = dst->drawable.height;
	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
	op->base.dst.bo = dst_bo;
	op->base.dst.x = op->base.dst.y = 0;

	/* 1x1 solid-colour bo from the cache; released again in
	 * gen7_render_fill_op_done(). */
	op->base.src.bo =
		sna_render_get_solid(sna,
				     sna_rgba_for_color(color,
							dst->drawable.depth));
	op->base.mask.bo = NULL;

	op->base.need_magic_ca_pass = false;
	op->base.floats_per_vertex = 2;
	op->base.floats_per_rect = 6;

	op->base.u.gen7.flags = FILL_FLAGS_NOBLEND;

	/* Reserve batch space; release the solid bo on failure. */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, op->base.src.bo);
			return false;
		}

		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, &op->base);
	gen7_emit_fill_state(sna, &op->base);

	op->blt   = gen7_render_fill_op_blt;
	op->box   = gen7_render_fill_op_box;
	op->boxes = gen7_render_fill_op_boxes;
	op->points = NULL;
	op->done  = gen7_render_fill_op_done;
	return true;
}
3667
3668static bool
3669gen7_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3670			     uint32_t color,
3671			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
3672			     uint8_t alu)
3673{
3674	BoxRec box;
3675
3676	box.x1 = x1;
3677	box.y1 = y1;
3678	box.x2 = x2;
3679	box.y2 = y2;
3680
3681	return sna_blt_fill_boxes(sna, alu,
3682				  bo, dst->drawable.bitsPerPixel,
3683				  color, &box, 1);
3684}
3685
static bool
gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
		     uint32_t color,
		     int16_t x1, int16_t y1,
		     int16_t x2, int16_t y2,
		     uint8_t alu)
{
	/* Fill the single rectangle (x1, y1)-(x2, y2) with a solid
	 * colour, choosing between the BLT and RENDER rings.  Unlike
	 * gen7_render_fill() this emits the rectangle immediately.
	 */
	struct sna_composite_op tmp;
	int16_t *v;

	/* Prefer to use the BLT if already engaged */
	if (prefer_blt_fill(sna, bo, FILL_BOXES) &&
	    gen7_render_fill_one_try_blt(sna, dst, bo, color,
					 x1, y1, x2, y2, alu))
		return true;

	/* Must use the BLT if we can't RENDER... */
	if (!(alu == GXcopy || alu == GXclear) ||
	    too_large(dst->drawable.width, dst->drawable.height))
		return gen7_render_fill_one_try_blt(sna, dst, bo, color,
						    x1, y1, x2, y2, alu);

	if (alu == GXclear)
		color = 0;

	tmp.dst.pixmap = dst;
	tmp.dst.width  = dst->drawable.width;
	tmp.dst.height = dst->drawable.height;
	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
	tmp.dst.bo = bo;
	tmp.dst.x = tmp.dst.y = 0;

	/* 1x1 solid-colour source; destroyed again before returning. */
	tmp.src.bo =
		sna_render_get_solid(sna,
				     sna_rgba_for_color(color,
							dst->drawable.depth));
	tmp.mask.bo = NULL;

	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.need_magic_ca_pass = false;

	tmp.u.gen7.flags = FILL_FLAGS_NOBLEND;

	/* Reserve batch space; release the solid bo on failure. */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
			return false;
		}
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, &tmp);
	gen7_emit_fill_state(sna, &tmp);

	gen7_get_rectangles(sna, &tmp, 1, gen7_emit_fill_state);

	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));

	/* Three corners of (x, y, u, v); constant texcoords select the
	 * solid colour. */
	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
	sna->render.vertex_used += 6;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[0] = x2;
	v[8] = v[4] = x1;
	v[5] = v[1] = y2;
	v[9] = y1;
	v[7] = v[2]  = v[3]  = 1;
	v[6] = v[10] = v[11] = 0;

	gen4_vertex_flush(sna);
	kgem_bo_destroy(&sna->kgem, tmp.src.bo);

	return true;
}
3763
3764static bool
3765gen7_render_clear_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
3766{
3767	BoxRec box;
3768
3769	box.x1 = 0;
3770	box.y1 = 0;
3771	box.x2 = dst->drawable.width;
3772	box.y2 = dst->drawable.height;
3773
3774	return sna_blt_fill_boxes(sna, GXclear,
3775				  bo, dst->drawable.bitsPerPixel,
3776				  0, &box, 1);
3777}
3778
static bool
gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
{
	/* Clear the entire pixmap to zero, preferring the BLT ring when
	 * it is already active or when the target exceeds the RENDER
	 * size limit.
	 */
	struct sna_composite_op tmp;
	int16_t *v;

	DBG(("%s: %dx%d\n",
	     __FUNCTION__,
	     dst->drawable.width,
	     dst->drawable.height));

	/* Prefer to use the BLT if already engaged */
	if (sna->kgem.mode == KGEM_BLT &&
	    gen7_render_clear_try_blt(sna, dst, bo))
		return true;

	/* Must use the BLT if we can't RENDER... */
	if (too_large(dst->drawable.width, dst->drawable.height))
		return gen7_render_clear_try_blt(sna, dst, bo);

	tmp.dst.pixmap = dst;
	tmp.dst.width  = dst->drawable.width;
	tmp.dst.height = dst->drawable.height;
	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
	tmp.dst.bo = bo;
	tmp.dst.x = tmp.dst.y = 0;

	/* Solid zero (black/transparent) source; released below. */
	tmp.src.bo = sna_render_get_solid(sna, 0);
	tmp.mask.bo = NULL;

	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.need_magic_ca_pass = false;

	tmp.u.gen7.flags = FILL_FLAGS_NOBLEND;

	/* Reserve batch space; release the solid bo on failure. */
	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
			return false;
		}
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	gen7_align_vertex(sna, &tmp);
	gen7_emit_fill_state(sna, &tmp);

	gen7_get_rectangles(sna, &tmp, 1, gen7_emit_fill_state);

	/* One rectangle covering the full drawable. */
	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
	sna->render.vertex_used += 6;
	assert(sna->render.vertex_used <= sna->render.vertex_size);

	v[0] = dst->drawable.width;
	v[5] = v[1] = dst->drawable.height;
	v[8] = v[4] = 0;
	v[9] = 0;

	v[7] = v[2]  = v[3]  = 1;
	v[6] = v[10] = v[11] = 0;

	gen4_vertex_flush(sna);
	kgem_bo_destroy(&sna->kgem, tmp.src.bo);

	return true;
}
static void gen7_render_reset(struct sna *sna)
{
	/* Invalidate all cached hardware state after a batch boundary so
	 * the next render operation re-emits the full pipeline setup.
	 */
	sna->render_state.gen7.pipe_controls_since_stall = 0;
	sna->render_state.gen7.emit_flush = false;
	sna->render_state.gen7.needs_invariant = true;
	sna->render_state.gen7.ve_id = 3 << 2;
	sna->render_state.gen7.last_primitive = -1;

	/* -1 marks each unit as "unknown", forcing re-emission. */
	sna->render_state.gen7.num_sf_outputs = 0;
	sna->render_state.gen7.samplers = -1;
	sna->render_state.gen7.blend = -1;
	sna->render_state.gen7.kernel = -1;
	sna->render_state.gen7.drawrect_offset = -1;
	sna->render_state.gen7.drawrect_limit = -1;
	sna->render_state.gen7.surface_table = 0;

	/* A vbo we can no longer map is useless for vertex upload. */
	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
		discard_vbo(sna);
	}

	sna->render.vertex_offset = 0;
	sna->render.nvertex_reloc = 0;
	sna->render.vb_id = 0;
}
3872
static void gen7_render_fini(struct sna *sna)
{
	/* Release the static-state bo (kernels, samplers, blend states)
	 * built by gen7_render_setup(). */
	kgem_bo_destroy(&sna->kgem, sna->render_state.gen7.general_bo);
}
3877
3878static bool is_gt3(struct sna *sna, int devid)
3879{
3880	assert(sna->kgem.gen == 075);
3881	return devid & 0x20;
3882}
3883
static bool is_gt2(struct sna *sna, int devid)
{
	/* The GT2 marker bits in the device id differ between Haswell
	 * (0x30) and the other gen7 parts (0x20). */
	const int mask = is_hsw(sna) ? 0x30 : 0x20;

	return (devid & mask) != 0;
}
3888
static bool is_mobile(struct sna *sna, int devid)
{
	/* Mobile variants carry 0x6 in the low nibble of the device id
	 * (presumably per Intel's gen7 PCI id scheme — confirm against
	 * the id tables). */
	return (devid & 0xf) == 0x6;
}
3893
static bool gen7_render_setup(struct sna *sna, int devid)
{
	/* One-time initialisation: identify the GT variant from the PCI
	 * device id, copy or compile the WM kernels, and lay out the
	 * static sampler and blend state in a single bo.  Returns false
	 * for unrecognised hardware or allocation failure.
	 */
	struct gen7_render_state *state = &sna->render_state.gen7;
	struct sna_static_stream general;
	struct gen7_sampler_state *ss;
	int i, j, k, l, m;

	/* Select per-GT hardware limits. */
	if (is_ivb(sna)) {
		state->info = &ivb_gt_info;
		if (devid & 0xf) {
			state->info = &ivb_gt1_info;
			if (is_gt2(sna, devid))
				state->info = &ivb_gt2_info; /* XXX requires GT_MODE WiZ disabled */
		}
	} else if (is_byt(sna)) {
		state->info = &byt_gt_info;
	} else if (is_hsw(sna)) {
		state->info = &hsw_gt_info;
		if (devid & 0xf) {
			if (is_gt3(sna, devid))
				state->info = &hsw_gt3_info;
			else if (is_gt2(sna, devid))
				state->info = &hsw_gt2_info;
			else
				state->info = &hsw_gt1_info;
		}
	} else
		return false;

	state->gt = state->info->gt;

	sna_static_stream_init(&general);

	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
	 * dumps, you know it points to zero.
	 */
	null_create(&general);

	/* Pre-built kernels (size != 0) are copied verbatim into slot
	 * [1]; the rest are compiled for each enabled dispatch width. */
	for (m = 0; m < GEN7_WM_KERNEL_COUNT; m++) {
		if (wm_kernels[m].size) {
			state->wm_kernel[m][1] =
				sna_static_stream_add(&general,
						      wm_kernels[m].data,
						      wm_kernels[m].size,
						      64);
		} else {
			if (USE_8_PIXEL_DISPATCH) {
				state->wm_kernel[m][0] =
					sna_static_stream_compile_wm(sna, &general,
								     wm_kernels[m].data, 8);
			}

			if (USE_16_PIXEL_DISPATCH) {
				state->wm_kernel[m][1] =
					sna_static_stream_compile_wm(sna, &general,
								     wm_kernels[m].data, 16);
			}

			if (USE_32_PIXEL_DISPATCH) {
				state->wm_kernel[m][2] =
					sna_static_stream_compile_wm(sna, &general,
								     wm_kernels[m].data, 32);
			}
		}
		/* At least one dispatch width must have been produced. */
		assert(state->wm_kernel[m][0]|state->wm_kernel[m][1]|state->wm_kernel[m][2]);
	}

	/* Sampler table: a copy pair, a fill pair, then one src/mask
	 * pair per (filter x extend) x (filter x extend) combination. */
	ss = sna_static_stream_map(&general,
				   2 * sizeof(*ss) *
				   (2 +
				    FILTER_COUNT * EXTEND_COUNT *
				    FILTER_COUNT * EXTEND_COUNT),
				   32);
	state->wm_state = sna_static_stream_offsetof(&general, ss);
	sampler_copy_init(ss); ss += 2;
	sampler_fill_init(ss); ss += 2;
	for (i = 0; i < FILTER_COUNT; i++) {
		for (j = 0; j < EXTEND_COUNT; j++) {
			for (k = 0; k < FILTER_COUNT; k++) {
				for (l = 0; l < EXTEND_COUNT; l++) {
					sampler_state_init(ss++, i, j);
					sampler_state_init(ss++, k, l);
				}
			}
		}
	}

	state->cc_blend = gen7_composite_create_blend_state(&general);

	state->general_bo = sna_static_stream_fini(sna, &general);
	return state->general_bo != NULL;
}
3986
const char *gen7_render_init(struct sna *sna, const char *backend)
{
	/* Hook the gen7 (Ivybridge/Baytrail/Haswell) render paths into
	 * sna->render.  On unrecognised hardware the given fallback
	 * backend name is returned unchanged; otherwise the name of the
	 * detected GT variant is returned.
	 */
	int devid = intel_get_device_id(sna->dev);

	if (!gen7_render_setup(sna, devid))
		return backend;

	/* gen6 ring-switch/retire callbacks are shared with gen7. */
	sna->kgem.context_switch = gen6_render_context_switch;
	sna->kgem.retire = gen6_render_retire;
	sna->kgem.expire = gen4_render_expire;

#if !NO_COMPOSITE
	sna->render.composite = gen7_render_composite;
	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
#endif
#if !NO_COMPOSITE_SPANS
	sna->render.check_composite_spans = gen7_check_composite_spans;
	sna->render.composite_spans = gen7_render_composite_spans;
	/* Span rendering only pays off on the faster variants. */
	if (is_mobile(sna, devid) || is_gt2(sna, devid) || is_byt(sna))
		sna->render.prefer_gpu |= PREFER_GPU_SPANS;
#endif
	sna->render.video = gen7_render_video;

#if !NO_COPY_BOXES
	sna->render.copy_boxes = gen7_render_copy_boxes;
#endif
#if !NO_COPY
	sna->render.copy = gen7_render_copy;
#endif

#if !NO_FILL_BOXES
	sna->render.fill_boxes = gen7_render_fill_boxes;
#endif
#if !NO_FILL
	sna->render.fill = gen7_render_fill;
#endif
#if !NO_FILL_ONE
	sna->render.fill_one = gen7_render_fill_one;
#endif
#if !NO_FILL_CLEAR
	sna->render.clear = gen7_render_clear;
#endif

	sna->render.flush = gen4_render_flush;
	sna->render.reset = gen7_render_reset;
	sna->render.fini = gen7_render_fini;

	sna->render.max_3d_size = GEN7_MAX_SIZE;
	sna->render.max_3d_pitch = 1 << 18;
	return sna->render_state.gen7.info->name;
}
4038