1/*
2 * Copyright © 2012,2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 *    Chris Wilson <chris@chris-wilson.co.uk>
25 *
26 */
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "sna.h"
33#include "sna_reg.h"
34#include "sna_render.h"
35#include "sna_render_inline.h"
36#include "sna_video.h"
37
38#include "gen8_render.h"
39#include "gen8_eu.h"
40#include "gen4_common.h"
41#include "gen4_source.h"
42#include "gen4_vertex.h"
43#include "gen6_common.h"
44#include "gen8_vertex.h"
45
46#define SIM 1
47
48#define ALWAYS_INVALIDATE 0
49#define ALWAYS_FLUSH 0
50#define ALWAYS_STALL 0
51
52#define NO_COMPOSITE 0
53#define NO_COMPOSITE_SPANS 0
54#define NO_COPY 0
55#define NO_COPY_BOXES 0
56#define NO_FILL 0
57#define NO_FILL_BOXES 0
58#define NO_FILL_ONE 0
59#define NO_FILL_CLEAR 0
60#define NO_VIDEO 0
61
62#define USE_8_PIXEL_DISPATCH 1
63#define USE_16_PIXEL_DISPATCH 1
64#define USE_32_PIXEL_DISPATCH 0
65
66#if !USE_8_PIXEL_DISPATCH && !USE_16_PIXEL_DISPATCH && !USE_32_PIXEL_DISPATCH
67#error "Must select at least 8, 16 or 32 pixel dispatch"
68#endif
69
70#define GEN8_MAX_SIZE 16384
71
72/* XXX Todo
73 *
74 * STR (software tiled rendering) mode. No, really.
75 * 64x32 pixel blocks align with the rendering cache. Worth considering.
76 */
77
78#define is_aligned(x, y) (((x) & ((y) - 1)) == 0)
79
80/* Pipeline stages:
81 *  1. Command Streamer (CS)
82 *  2. Vertex Fetch (VF)
83 *  3. Vertex Shader (VS)
84 *  4. Hull Shader (HS)
85 *  5. Tesselation Engine (TE)
86 *  6. Domain Shader (DS)
87 *  7. Geometry Shader (GS)
88 *  8. Stream Output Logic (SOL)
89 *  9. Clipper (CLIP)
90 * 10. Strip/Fan (SF)
91 * 11. Windower/Masker (WM)
92 * 12. Color Calculator (CC)
93 */
94
95#if !NO_VIDEO
96static const uint32_t ps_kernel_packed[][4] = {
97#include "exa_wm_src_affine.g8b"
98#include "exa_wm_src_sample_argb.g8b"
99#include "exa_wm_yuv_rgb.g8b"
100#include "exa_wm_write.g8b"
101};
102
103static const uint32_t ps_kernel_planar[][4] = {
104#include "exa_wm_src_affine.g8b"
105#include "exa_wm_src_sample_planar.g8b"
106#include "exa_wm_yuv_rgb.g8b"
107#include "exa_wm_write.g8b"
108};
109#endif
110
111#define SURFACE_DW (64 / sizeof(uint32_t));
112
113#define KERNEL(kernel_enum, kernel, num_surfaces) \
114    [GEN8_WM_KERNEL_##kernel_enum] = {#kernel_enum, kernel, sizeof(kernel), num_surfaces}
115#define NOKERNEL(kernel_enum, func, num_surfaces) \
116    [GEN8_WM_KERNEL_##kernel_enum] = {#kernel_enum, (void *)func, 0, num_surfaces}
117static const struct wm_kernel_info {
118	const char *name;
119	const void *data;
120	unsigned int size;
121	int num_surfaces;
122} wm_kernels[] = {
123	NOKERNEL(NOMASK, gen8_wm_kernel__affine, 2),
124	NOKERNEL(NOMASK_P, gen8_wm_kernel__projective, 2),
125
126	NOKERNEL(MASK, gen8_wm_kernel__affine_mask, 3),
127	NOKERNEL(MASK_P, gen8_wm_kernel__projective_mask, 3),
128
129	NOKERNEL(MASKCA, gen8_wm_kernel__affine_mask_ca, 3),
130	NOKERNEL(MASKCA_P, gen8_wm_kernel__projective_mask_ca, 3),
131
132	NOKERNEL(MASKSA, gen8_wm_kernel__affine_mask_sa, 3),
133	NOKERNEL(MASKSA_P, gen8_wm_kernel__projective_mask_sa, 3),
134
135	NOKERNEL(OPACITY, gen8_wm_kernel__affine_opacity, 2),
136	NOKERNEL(OPACITY_P, gen8_wm_kernel__projective_opacity, 2),
137
138#if !NO_VIDEO
139	KERNEL(VIDEO_PLANAR, ps_kernel_planar, 7),
140	KERNEL(VIDEO_PACKED, ps_kernel_packed, 2),
141#endif
142};
143#undef KERNEL
144
145static const struct blendinfo {
146	uint8_t src_alpha;
147	uint8_t src_blend;
148	uint8_t dst_blend;
149} gen8_blend_op[] = {
150	/* Clear */	{0, BLENDFACTOR_ZERO, BLENDFACTOR_ZERO},
151	/* Src */	{0, BLENDFACTOR_ONE, BLENDFACTOR_ZERO},
152	/* Dst */	{0, BLENDFACTOR_ZERO, BLENDFACTOR_ONE},
153	/* Over */	{1, BLENDFACTOR_ONE, BLENDFACTOR_INV_SRC_ALPHA},
154	/* OverReverse */ {0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ONE},
155	/* In */	{0, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_ZERO},
156	/* InReverse */	{1, BLENDFACTOR_ZERO, BLENDFACTOR_SRC_ALPHA},
157	/* Out */	{0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ZERO},
158	/* OutReverse */ {1, BLENDFACTOR_ZERO, BLENDFACTOR_INV_SRC_ALPHA},
159	/* Atop */	{1, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
160	/* AtopReverse */ {1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_SRC_ALPHA},
161	/* Xor */	{1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
162	/* Add */	{0, BLENDFACTOR_ONE, BLENDFACTOR_ONE},
163};
164
165/**
166 * Highest-valued BLENDFACTOR used in gen8_blend_op.
167 *
168 * This leaves out GEN8_BLENDFACTOR_INV_DST_COLOR,
169 * GEN8_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
170 * GEN8_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
171 */
172#define GEN8_BLENDFACTOR_COUNT (BLENDFACTOR_INV_DST_ALPHA + 1)
173
174#define GEN8_BLEND_STATE_PADDED_SIZE	ALIGN(sizeof(struct gen8_blend_state), 64)
175
176#define BLEND_OFFSET(s, d) \
177	((d != BLENDFACTOR_ZERO) << 15 | ((s) * GEN8_BLENDFACTOR_COUNT + (d)) << 4)
178
179#define NO_BLEND BLEND_OFFSET(BLENDFACTOR_ONE, BLENDFACTOR_ZERO)
180#define CLEAR BLEND_OFFSET(BLENDFACTOR_ZERO, BLENDFACTOR_ZERO)
181
182#define SAMPLER_OFFSET(sf, se, mf, me) \
183	(((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) + 2)
184
185#define VERTEX_2s2s 0
186
187#define COPY_SAMPLER 0
188#define COPY_VERTEX VERTEX_2s2s
189#define COPY_FLAGS(a) GEN8_SET_FLAGS(COPY_SAMPLER, (a) == GXcopy ? NO_BLEND : CLEAR, GEN8_WM_KERNEL_NOMASK, COPY_VERTEX)
190
191#define FILL_SAMPLER 1
192#define FILL_VERTEX VERTEX_2s2s
193#define FILL_FLAGS(op, format) GEN8_SET_FLAGS(FILL_SAMPLER, gen8_get_blend((op), false, (format)), GEN8_WM_KERNEL_NOMASK, FILL_VERTEX)
194#define FILL_FLAGS_NOBLEND GEN8_SET_FLAGS(FILL_SAMPLER, NO_BLEND, GEN8_WM_KERNEL_NOMASK, FILL_VERTEX)
195
196#define GEN8_SAMPLER(f) (((f) >> 20) & 0xfff)
197#define GEN8_BLEND(f) (((f) >> 4) & 0x7ff)
198#define GEN8_READS_DST(f) (((f) >> 15) & 1)
199#define GEN8_KERNEL(f) (((f) >> 16) & 0xf)
200#define GEN8_VERTEX(f) (((f) >> 0) & 0xf)
201#define GEN8_SET_FLAGS(S, B, K, V)  ((S) << 20 | (K) << 16 | (B) | (V))
202
203#define OUT_BATCH(v) batch_emit(sna, v)
204#define OUT_BATCH64(v) batch_emit64(sna, v)
205#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
206#define OUT_VERTEX_F(v) vertex_emit(sna, v)
207
208static inline bool too_large(int width, int height)
209{
210	return width > GEN8_MAX_SIZE || height > GEN8_MAX_SIZE;
211}
212
213static inline bool unaligned(struct kgem_bo *bo, int bpp)
214{
215	/* XXX What exactly do we need to meet H_ALIGN and V_ALIGN? */
216#if 0
217	int x, y;
218
219	if (bo->proxy == NULL)
220		return false;
221
222	/* Assume that all tiled proxies are constructed correctly. */
223	if (bo->tiling)
224		return false;
225
226	DBG(("%s: checking alignment of a linear proxy, offset=%d, pitch=%d, bpp=%d: => (%d, %d)\n",
227	     __FUNCTION__, bo->delta, bo->pitch, bpp,
228	     8 * (bo->delta % bo->pitch) / bpp, bo->delta / bo->pitch));
229
230	/* This may be a random userptr map, check that it meets the
231	 * render alignment of SURFACE_VALIGN_4 | SURFACE_HALIGN_4.
232	 */
233	y = bo->delta / bo->pitch;
234	if (y & 3)
235		return true;
236
237	x = 8 * (bo->delta - y * bo->pitch);
238	if (x & (4*bpp - 1))
239	    return true;
240
241	return false;
242#else
243	return false;
244#endif
245}
246
247static uint32_t gen8_get_blend(int op,
248			       bool has_component_alpha,
249			       uint32_t dst_format)
250{
251	uint32_t src, dst;
252
253	COMPILE_TIME_ASSERT(BLENDFACTOR_INV_DST_ALPHA*GEN8_BLENDFACTOR_COUNT + BLENDFACTOR_INV_DST_ALPHA <= 0x7ff);
254
255	src = gen8_blend_op[op].src_blend;
256	dst = gen8_blend_op[op].dst_blend;
257
258	/* If there's no dst alpha channel, adjust the blend op so that
259	 * we'll treat it always as 1.
260	 */
261	if (PICT_FORMAT_A(dst_format) == 0) {
262		if (src == BLENDFACTOR_DST_ALPHA)
263			src = BLENDFACTOR_ONE;
264		else if (src == BLENDFACTOR_INV_DST_ALPHA)
265			src = BLENDFACTOR_ZERO;
266	}
267
268	/* If the source alpha is being used, then we should only be in a
269	 * case where the source blend factor is 0, and the source blend
270	 * value is the mask channels multiplied by the source picture's alpha.
271	 */
272	if (has_component_alpha && gen8_blend_op[op].src_alpha) {
273		if (dst == BLENDFACTOR_SRC_ALPHA)
274			dst = BLENDFACTOR_SRC_COLOR;
275		else if (dst == BLENDFACTOR_INV_SRC_ALPHA)
276			dst = BLENDFACTOR_INV_SRC_COLOR;
277	}
278
279	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
280	     op, dst_format, PICT_FORMAT_A(dst_format),
281	     src, dst, (int)(BLEND_OFFSET(src, dst)>>4)));
282	assert(BLEND_OFFSET(src, dst) >> 4 <= 0xfff);
283	return BLEND_OFFSET(src, dst);
284}
285
286static uint32_t gen8_get_card_format(PictFormat format)
287{
288	switch (format) {
289	default:
290		return -1;
291	case PICT_a8r8g8b8:
292		return SURFACEFORMAT_B8G8R8A8_UNORM;
293	case PICT_x8r8g8b8:
294		return SURFACEFORMAT_B8G8R8X8_UNORM;
295	case PICT_a8b8g8r8:
296		return SURFACEFORMAT_R8G8B8A8_UNORM;
297	case PICT_x8b8g8r8:
298		return SURFACEFORMAT_R8G8B8X8_UNORM;
299#ifdef PICT_a2r10g10b10
300	case PICT_a2r10g10b10:
301		return SURFACEFORMAT_B10G10R10A2_UNORM;
302	case PICT_x2r10g10b10:
303		return SURFACEFORMAT_B10G10R10X2_UNORM;
304#endif
305	case PICT_r8g8b8:
306		return SURFACEFORMAT_R8G8B8_UNORM;
307	case PICT_r5g6b5:
308		return SURFACEFORMAT_B5G6R5_UNORM;
309	case PICT_a1r5g5b5:
310		return SURFACEFORMAT_B5G5R5A1_UNORM;
311	case PICT_a8:
312		return SURFACEFORMAT_A8_UNORM;
313	case PICT_a4r4g4b4:
314		return SURFACEFORMAT_B4G4R4A4_UNORM;
315	}
316}
317
318static uint32_t gen8_get_dest_format(PictFormat format)
319{
320	switch (format) {
321	default:
322		return -1;
323	case PICT_a8r8g8b8:
324	case PICT_x8r8g8b8:
325		return SURFACEFORMAT_B8G8R8A8_UNORM;
326	case PICT_a8b8g8r8:
327	case PICT_x8b8g8r8:
328		return SURFACEFORMAT_R8G8B8A8_UNORM;
329#ifdef PICT_a2r10g10b10
330	case PICT_a2r10g10b10:
331	case PICT_x2r10g10b10:
332		return SURFACEFORMAT_B10G10R10A2_UNORM;
333#endif
334	case PICT_r5g6b5:
335		return SURFACEFORMAT_B5G6R5_UNORM;
336	case PICT_x1r5g5b5:
337	case PICT_a1r5g5b5:
338		return SURFACEFORMAT_B5G5R5A1_UNORM;
339	case PICT_a8:
340		return SURFACEFORMAT_A8_UNORM;
341	case PICT_a4r4g4b4:
342	case PICT_x4r4g4b4:
343		return SURFACEFORMAT_B4G4R4A4_UNORM;
344	}
345}
346
347static bool gen8_check_dst_format(PictFormat format)
348{
349	if (gen8_get_dest_format(format) != -1)
350		return true;
351
352	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
353	return false;
354}
355
356static bool gen8_check_format(uint32_t format)
357{
358	if (gen8_get_card_format(format) != -1)
359		return true;
360
361	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
362	return false;
363}
364
365static uint32_t gen8_filter(uint32_t filter)
366{
367	switch (filter) {
368	default:
369		assert(0);
370	case PictFilterNearest:
371		return SAMPLER_FILTER_NEAREST;
372	case PictFilterBilinear:
373		return SAMPLER_FILTER_BILINEAR;
374	}
375}
376
377static uint32_t gen8_check_filter(PicturePtr picture)
378{
379	switch (picture->filter) {
380	case PictFilterNearest:
381	case PictFilterBilinear:
382		return true;
383	default:
384		return false;
385	}
386}
387
388static uint32_t gen8_repeat(uint32_t repeat)
389{
390	switch (repeat) {
391	default:
392		assert(0);
393	case RepeatNone:
394		return SAMPLER_EXTEND_NONE;
395	case RepeatNormal:
396		return SAMPLER_EXTEND_REPEAT;
397	case RepeatPad:
398		return SAMPLER_EXTEND_PAD;
399	case RepeatReflect:
400		return SAMPLER_EXTEND_REFLECT;
401	}
402}
403
404static bool gen8_check_repeat(PicturePtr picture)
405{
406	if (!picture->repeat)
407		return true;
408
409	switch (picture->repeatType) {
410	case RepeatNone:
411	case RepeatNormal:
412	case RepeatPad:
413	case RepeatReflect:
414		return true;
415	default:
416		return false;
417	}
418}
419
420static int
421gen8_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
422{
423	int base;
424
425	if (has_mask) {
426		if (is_ca) {
427			if (gen8_blend_op[op].src_alpha)
428				base = GEN8_WM_KERNEL_MASKSA;
429			else
430				base = GEN8_WM_KERNEL_MASKCA;
431		} else
432			base = GEN8_WM_KERNEL_MASK;
433	} else
434		base = GEN8_WM_KERNEL_NOMASK;
435
436	return base + !is_affine;
437}
438
439static void
440gen8_emit_push_constants(struct sna *sna)
441{
442#if SIM
443	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS | (2 - 2));
444	OUT_BATCH(0);
445
446	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS | (2 - 2));
447	OUT_BATCH(0);
448
449	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS | (2 - 2));
450	OUT_BATCH(0);
451
452	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS | (2 - 2));
453	OUT_BATCH(0);
454
455	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS | (2 - 2));
456	OUT_BATCH(0);
457#endif
458}
459
460static void
461gen8_emit_urb(struct sna *sna)
462{
463	/* num of VS entries must be divisible by 8 if size < 9 */
464	OUT_BATCH(GEN8_3DSTATE_URB_VS | (2 - 2));
465	OUT_BATCH(960 << URB_ENTRY_NUMBER_SHIFT |
466		  (2 - 1) << URB_ENTRY_SIZE_SHIFT |
467		  4 << URB_STARTING_ADDRESS_SHIFT);
468
469	OUT_BATCH(GEN8_3DSTATE_URB_HS | (2 - 2));
470	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
471		  4 << URB_STARTING_ADDRESS_SHIFT);
472
473	OUT_BATCH(GEN8_3DSTATE_URB_DS | (2 - 2));
474	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
475		  4 << URB_STARTING_ADDRESS_SHIFT);
476
477	OUT_BATCH(GEN8_3DSTATE_URB_GS | (2 - 2));
478	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
479		  4 << URB_STARTING_ADDRESS_SHIFT);
480}
481
482static void
483gen8_emit_state_base_address(struct sna *sna)
484{
485	uint32_t num_pages;
486
487	assert(sna->kgem.surface - sna->kgem.nbatch <= 16384);
488
489	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));
490	OUT_BATCH64(0); /* general */
491	OUT_BATCH(0); /* stateless dataport */
492	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* surface */
493				     sna->kgem.nbatch,
494				     NULL,
495				     I915_GEM_DOMAIN_INSTRUCTION << 16,
496				     BASE_ADDRESS_MODIFY));
497	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* dynamic */
498				     sna->kgem.nbatch,
499				     sna->render_state.gen8.general_bo,
500				     I915_GEM_DOMAIN_INSTRUCTION << 16,
501				     BASE_ADDRESS_MODIFY));
502	OUT_BATCH64(0); /* indirect */
503	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* instruction */
504				     sna->kgem.nbatch,
505				     sna->render_state.gen8.general_bo,
506				     I915_GEM_DOMAIN_INSTRUCTION << 16,
507				     BASE_ADDRESS_MODIFY));
508	/* upper bounds */
509	num_pages = sna->render_state.gen8.general_bo->size.pages.count;
510	OUT_BATCH(0); /* general */
511	OUT_BATCH(num_pages << 12 | 1); /* dynamic */
512	OUT_BATCH(0); /* indirect */
513	OUT_BATCH(num_pages << 12 | 1); /* instruction */
514}
515
516static void
517gen8_emit_vs_invariant(struct sna *sna)
518{
519	OUT_BATCH(GEN8_3DSTATE_VS | (9 - 2));
520	OUT_BATCH64(0); /* no VS kernel */
521	OUT_BATCH(0);
522	OUT_BATCH64(0); /* scratch */
523	OUT_BATCH(0);
524	OUT_BATCH(1 << 1); /* pass-through */
525	OUT_BATCH(1 << 16 | 1 << 21); /* urb write to SBE */
526
527#if SIM
528	OUT_BATCH(GEN8_3DSTATE_CONSTANT_VS | (11 - 2));
529	OUT_BATCH(0);
530	OUT_BATCH(0);
531	OUT_BATCH64(0);
532	OUT_BATCH64(0);
533	OUT_BATCH64(0);
534	OUT_BATCH64(0);
535
536	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS | (2 - 2));
537	OUT_BATCH(0);
538
539	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS | (2 - 2));
540	OUT_BATCH(0);
541#endif
542}
543
544static void
545gen8_emit_hs_invariant(struct sna *sna)
546{
547	OUT_BATCH(GEN8_3DSTATE_HS | (9 - 2));
548	OUT_BATCH(0);
549	OUT_BATCH(0);
550	OUT_BATCH64(0); /* no HS kernel */
551	OUT_BATCH64(0); /* scratch */
552	OUT_BATCH(0);
553	OUT_BATCH(0); /* pass-through */
554
555#if SIM
556	OUT_BATCH(GEN8_3DSTATE_CONSTANT_HS | (11 - 2));
557	OUT_BATCH(0);
558	OUT_BATCH(0);
559	OUT_BATCH64(0);
560	OUT_BATCH64(0);
561	OUT_BATCH64(0);
562	OUT_BATCH64(0);
563
564#if 1
565	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS | (2 - 2));
566	OUT_BATCH(0);
567
568	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS | (2 - 2));
569	OUT_BATCH(0);
570#endif
571#endif
572}
573
574static void
575gen8_emit_te_invariant(struct sna *sna)
576{
577	OUT_BATCH(GEN8_3DSTATE_TE | (4 - 2));
578	OUT_BATCH(0);
579	OUT_BATCH(0);
580	OUT_BATCH(0);
581}
582
583static void
584gen8_emit_ds_invariant(struct sna *sna)
585{
586	OUT_BATCH(GEN8_3DSTATE_DS | (9 - 2));
587	OUT_BATCH64(0); /* no kernel */
588	OUT_BATCH(0);
589	OUT_BATCH64(0); /* scratch */
590	OUT_BATCH(0);
591	OUT_BATCH(0);
592	OUT_BATCH(0);
593
594#if SIM
595	OUT_BATCH(GEN8_3DSTATE_CONSTANT_DS | (11 - 2));
596	OUT_BATCH(0);
597	OUT_BATCH(0);
598	OUT_BATCH64(0);
599	OUT_BATCH64(0);
600	OUT_BATCH64(0);
601	OUT_BATCH64(0);
602
603#if 1
604	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS | (2 - 2));
605	OUT_BATCH(0);
606
607	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS | (2 - 2));
608	OUT_BATCH(0);
609#endif
610#endif
611}
612
613static void
614gen8_emit_gs_invariant(struct sna *sna)
615{
616	OUT_BATCH(GEN8_3DSTATE_GS | (10 - 2));
617	OUT_BATCH64(0); /* no GS kernel */
618	OUT_BATCH(0);
619	OUT_BATCH64(0); /* scratch */
620	OUT_BATCH(0);
621	OUT_BATCH(0); /* pass-through */
622	OUT_BATCH(0);
623	OUT_BATCH(0);
624
625#if SIM
626	OUT_BATCH(GEN8_3DSTATE_CONSTANT_GS | (11 - 2));
627	OUT_BATCH(0);
628	OUT_BATCH(0);
629	OUT_BATCH64(0);
630	OUT_BATCH64(0);
631	OUT_BATCH64(0);
632	OUT_BATCH64(0);
633
634#if 1
635	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS | (2 - 2));
636	OUT_BATCH(0);
637
638	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS | (2 - 2));
639	OUT_BATCH(0);
640#endif
641#endif
642}
643
644static void
645gen8_emit_sol_invariant(struct sna *sna)
646{
647	OUT_BATCH(GEN8_3DSTATE_STREAMOUT | (5 - 2));
648	OUT_BATCH(0);
649	OUT_BATCH(0);
650	OUT_BATCH(0);
651	OUT_BATCH(0);
652}
653
654static void
655gen8_emit_sf_invariant(struct sna *sna)
656{
657	OUT_BATCH(GEN8_3DSTATE_SF | (4 - 2));
658	OUT_BATCH(0);
659	OUT_BATCH(0);
660	OUT_BATCH(0);
661}
662
663static void
664gen8_emit_clip_invariant(struct sna *sna)
665{
666	OUT_BATCH(GEN8_3DSTATE_CLIP | (4 - 2));
667	OUT_BATCH(0);
668	OUT_BATCH(0); /* pass-through */
669	OUT_BATCH(0);
670
671	OUT_BATCH(GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP | (2 - 2));
672	OUT_BATCH(0);
673
674	OUT_BATCH(GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC | (2 - 2));
675	OUT_BATCH(0);
676}
677
678static void
679gen8_emit_null_depth_buffer(struct sna *sna)
680{
681	OUT_BATCH(GEN8_3DSTATE_DEPTH_BUFFER | (8 - 2));
682#if 0
683	OUT_BATCH(SURFACE_NULL << DEPTH_BUFFER_TYPE_SHIFT |
684		  DEPTHFORMAT_D32_FLOAT << DEPTH_BUFFER_FORMAT_SHIFT);
685#else
686	OUT_BATCH(SURFACE_2D << DEPTH_BUFFER_TYPE_SHIFT |
687		  DEPTHFORMAT_D16_UNORM << DEPTH_BUFFER_FORMAT_SHIFT);
688#endif
689	OUT_BATCH64(0);
690	OUT_BATCH(0);
691	OUT_BATCH(0);
692	OUT_BATCH(0);
693	OUT_BATCH(0);
694
695#if SIM
696	OUT_BATCH(GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5 - 2));
697	OUT_BATCH(0);
698	OUT_BATCH64(0);
699	OUT_BATCH(0);
700#endif
701
702#if SIM
703	OUT_BATCH(GEN8_3DSTATE_STENCIL_BUFFER | (5 - 2));
704	OUT_BATCH(0);
705	OUT_BATCH64(0);
706	OUT_BATCH(0);
707#endif
708
709#if SIM
710	OUT_BATCH(GEN8_3DSTATE_WM_DEPTH_STENCIL | (3 - 2));
711	OUT_BATCH(0);
712	OUT_BATCH(0);
713#endif
714
715#if SIM
716	OUT_BATCH(GEN8_3DSTATE_CLEAR_PARAMS | (3 - 2));
717	OUT_BATCH(0);
718	OUT_BATCH(0);
719#endif
720}
721
722static void
723gen8_emit_wm_invariant(struct sna *sna)
724{
725	gen8_emit_null_depth_buffer(sna);
726
727#if SIM
728	OUT_BATCH(GEN8_3DSTATE_SCISSOR_STATE_POINTERS | (2 - 2));
729	OUT_BATCH(0);
730#endif
731
732	OUT_BATCH(GEN8_3DSTATE_WM | (2 - 2));
733	//OUT_BATCH(WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC); /* XXX */
734	OUT_BATCH(WM_PERSPECTIVE_PIXEL_BARYCENTRIC);
735
736#if SIM
737	OUT_BATCH(GEN8_3DSTATE_WM_CHROMAKEY | (2 - 2));
738	OUT_BATCH(0);
739#endif
740
741#if 0
742	OUT_BATCH(GEN8_3DSTATE_WM_HZ_OP | (5 - 2));
743	OUT_BATCH(0);
744	OUT_BATCH(0);
745	OUT_BATCH(0);
746	OUT_BATCH(0);
747#endif
748
749	OUT_BATCH(GEN8_3DSTATE_PS_EXTRA | (2 - 2));
750	OUT_BATCH(PSX_PIXEL_SHADER_VALID |
751		  PSX_ATTRIBUTE_ENABLE);
752
753	OUT_BATCH(GEN8_3DSTATE_RASTER | (5 - 2));
754	OUT_BATCH(RASTER_FRONT_WINDING_CCW |
755		  RASTER_CULL_NONE);
756	OUT_BATCH(0);
757	OUT_BATCH(0);
758	OUT_BATCH(0);
759
760	OUT_BATCH(GEN8_3DSTATE_SBE_SWIZ | (11 - 2));
761	OUT_BATCH(0);
762	OUT_BATCH(0);
763	OUT_BATCH(0);
764	OUT_BATCH(0);
765	OUT_BATCH(0);
766	OUT_BATCH(0);
767	OUT_BATCH(0);
768	OUT_BATCH(0);
769	OUT_BATCH(0);
770	OUT_BATCH(0);
771
772#if SIM
773	OUT_BATCH(GEN8_3DSTATE_CONSTANT_PS | (11 - 2));
774	OUT_BATCH(0);
775	OUT_BATCH(0);
776	OUT_BATCH64(0);
777	OUT_BATCH64(0);
778	OUT_BATCH64(0);
779	OUT_BATCH64(0);
780#endif
781}
782
783static void
784gen8_emit_cc_invariant(struct sna *sna)
785{
786}
787
788static void
789gen8_emit_vf_invariant(struct sna *sna)
790{
791	int n;
792
793#if 1
794	OUT_BATCH(GEN8_3DSTATE_VF | (2 - 2));
795	OUT_BATCH(0);
796#endif
797
798	OUT_BATCH(GEN8_3DSTATE_VF_SGVS | (2 - 2));
799	OUT_BATCH(0);
800
801	OUT_BATCH(GEN8_3DSTATE_VF_TOPOLOGY | (2 - 2));
802	OUT_BATCH(RECTLIST);
803
804	OUT_BATCH(GEN8_3DSTATE_VF_STATISTICS | 0);
805
806	for (n = 1; n <= 3; n++) {
807		OUT_BATCH(GEN8_3DSTATE_VF_INSTANCING | (3 - 2));
808		OUT_BATCH(n);
809		OUT_BATCH(0);
810	}
811}
812
813static void
814gen8_emit_invariant(struct sna *sna)
815{
816	OUT_BATCH(GEN8_PIPELINE_SELECT | PIPELINE_SELECT_3D);
817
818#if SIM
819	OUT_BATCH(GEN8_STATE_SIP | (3 - 2));
820	OUT_BATCH64(0);
821#endif
822
823	OUT_BATCH(GEN8_3DSTATE_MULTISAMPLE | (2 - 2));
824	OUT_BATCH(MULTISAMPLE_PIXEL_LOCATION_CENTER |
825		  MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */
826
827	OUT_BATCH(GEN8_3DSTATE_SAMPLE_MASK | (2 - 2));
828	OUT_BATCH(1);
829
830#if SIM
831	OUT_BATCH(GEN8_3DSTATE_SAMPLE_PATTERN | (5 - 2));
832	OUT_BATCH(0);
833	OUT_BATCH(0);
834	OUT_BATCH(0);
835	//OUT_BATCH(8<<20 | 8<<16);
836	OUT_BATCH(0);
837#endif
838
839	gen8_emit_push_constants(sna);
840	gen8_emit_urb(sna);
841
842	gen8_emit_state_base_address(sna);
843
844	gen8_emit_vf_invariant(sna);
845	gen8_emit_vs_invariant(sna);
846	gen8_emit_hs_invariant(sna);
847	gen8_emit_te_invariant(sna);
848	gen8_emit_ds_invariant(sna);
849	gen8_emit_gs_invariant(sna);
850	gen8_emit_sol_invariant(sna);
851	gen8_emit_clip_invariant(sna);
852	gen8_emit_sf_invariant(sna);
853	gen8_emit_wm_invariant(sna);
854	gen8_emit_cc_invariant(sna);
855
856	sna->render_state.gen8.needs_invariant = false;
857}
858
859static void
860gen8_emit_cc(struct sna *sna, uint32_t blend)
861{
862	struct gen8_render_state *render = &sna->render_state.gen8;
863
864	if (render->blend == blend)
865		return;
866
867	DBG(("%s: blend=%x (current=%x), src=%d, dst=%d\n",
868	     __FUNCTION__, blend, render->blend,
869	     blend / GEN8_BLENDFACTOR_COUNT,
870	     blend % GEN8_BLENDFACTOR_COUNT));
871
872	assert(blend < GEN8_BLENDFACTOR_COUNT * GEN8_BLENDFACTOR_COUNT);
873	assert(blend / GEN8_BLENDFACTOR_COUNT > 0);
874	assert(blend % GEN8_BLENDFACTOR_COUNT > 0);
875
876	/* XXX can have upto 8 blend states preload, selectable via
877	 * Render Target Index. What other side-effects of Render Target Index?
878	 */
879
880	OUT_BATCH(GEN8_3DSTATE_PS_BLEND | (2 - 2));
881	if (blend != GEN8_BLEND(NO_BLEND)) {
882		uint32_t src = blend / GEN8_BLENDFACTOR_COUNT;
883		uint32_t dst = blend % GEN8_BLENDFACTOR_COUNT;
884		OUT_BATCH(PS_BLEND_HAS_WRITEABLE_RT |
885			  PS_BLEND_COLOR_BLEND_ENABLE |
886			  src << PS_BLEND_SRC_ALPHA_SHIFT |
887			  dst << PS_BLEND_DST_ALPHA_SHIFT |
888			  src << PS_BLEND_SRC_SHIFT |
889			  dst << PS_BLEND_DST_SHIFT);
890	} else
891		OUT_BATCH(PS_BLEND_HAS_WRITEABLE_RT);
892
893	assert(is_aligned(render->cc_blend + blend * GEN8_BLEND_STATE_PADDED_SIZE, 64));
894	OUT_BATCH(GEN8_3DSTATE_BLEND_STATE_POINTERS | (2 - 2));
895	OUT_BATCH((render->cc_blend + blend * GEN8_BLEND_STATE_PADDED_SIZE) | 1);
896
897	/* Force a CC_STATE pointer change to improve blend performance */
898	OUT_BATCH(GEN8_3DSTATE_CC_STATE_POINTERS | (2 - 2));
899	OUT_BATCH(0);
900
901	render->blend = blend;
902}
903
904static void
905gen8_emit_sampler(struct sna *sna, uint32_t state)
906{
907	if (sna->render_state.gen8.samplers == state)
908		return;
909
910	sna->render_state.gen8.samplers = state;
911
912	DBG(("%s: sampler = %x\n", __FUNCTION__, state));
913
914	assert(2 * sizeof(struct gen8_sampler_state) == 32);
915	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS | (2 - 2));
916	OUT_BATCH(sna->render_state.gen8.wm_state + state * 2 * sizeof(struct gen8_sampler_state));
917}
918
919static void
920gen8_emit_sf(struct sna *sna, bool has_mask)
921{
922	int num_sf_outputs = has_mask ? 2 : 1;
923
924	if (sna->render_state.gen8.num_sf_outputs == num_sf_outputs)
925		return;
926
927	DBG(("%s: num_sf_outputs=%d\n", __FUNCTION__, num_sf_outputs));
928
929	sna->render_state.gen8.num_sf_outputs = num_sf_outputs;
930
931	OUT_BATCH(GEN8_3DSTATE_SBE | (4 - 2));
932	OUT_BATCH(num_sf_outputs << SBE_NUM_OUTPUTS_SHIFT |
933		  SBE_FORCE_VERTEX_URB_READ_LENGTH | /* forced is faster */
934		  SBE_FORCE_VERTEX_URB_READ_OFFSET |
935		  1 << SBE_URB_ENTRY_READ_LENGTH_SHIFT |
936		  1 << SBE_URB_ENTRY_READ_OFFSET_SHIFT);
937	OUT_BATCH(0);
938	OUT_BATCH(0);
939}
940
941static void
942gen8_emit_wm(struct sna *sna, int kernel)
943{
944	const uint32_t *kernels;
945
946	assert(kernel < ARRAY_SIZE(wm_kernels));
947	if (sna->render_state.gen8.kernel == kernel)
948		return;
949
950	sna->render_state.gen8.kernel = kernel;
951	kernels = sna->render_state.gen8.wm_kernel[kernel];
952
953	DBG(("%s: switching to %s, num_surfaces=%d (8-wide? %d, 16-wide? %d, 32-wide? %d)\n",
954	     __FUNCTION__,
955	     wm_kernels[kernel].name,
956	     wm_kernels[kernel].num_surfaces,
957	     kernels[0], kernels[1], kernels[2]));
958	assert(is_aligned(kernels[0], 64));
959	assert(is_aligned(kernels[1], 64));
960	assert(is_aligned(kernels[2], 64));
961
962	OUT_BATCH(GEN8_3DSTATE_PS | (12 - 2));
963	OUT_BATCH64(kernels[0] ?: kernels[1] ?: kernels[2]);
964	OUT_BATCH(1 << PS_SAMPLER_COUNT_SHIFT |
965		  PS_VECTOR_MASK_ENABLE |
966		  wm_kernels[kernel].num_surfaces << PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
967	OUT_BATCH64(0); /* scratch address */
968	OUT_BATCH(PS_MAX_THREADS |
969		  (kernels[0] ? PS_8_DISPATCH_ENABLE : 0) |
970		  (kernels[1] ? PS_16_DISPATCH_ENABLE : 0) |
971		  (kernels[2] ? PS_32_DISPATCH_ENABLE : 0));
972	OUT_BATCH((kernels[0] ? 4 : kernels[1] ? 6 : 8) << PS_DISPATCH_START_GRF_SHIFT_0 |
973		  8 << PS_DISPATCH_START_GRF_SHIFT_1 |
974		  6 << PS_DISPATCH_START_GRF_SHIFT_2);
975	OUT_BATCH64(kernels[2]);
976	OUT_BATCH64(kernels[1]);
977}
978
979static bool
980gen8_emit_binding_table(struct sna *sna, uint16_t offset)
981{
982	if (sna->render_state.gen8.surface_table == offset)
983		return false;
984
985	/* Binding table pointers */
986	assert(is_aligned(4*offset, 32));
987	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS | (2 - 2));
988	OUT_BATCH(offset*4);
989
990	sna->render_state.gen8.surface_table = offset;
991	return true;
992}
993
994static bool
995gen8_emit_drawing_rectangle(struct sna *sna,
996			    const struct sna_composite_op *op)
997{
998	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
999	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;
1000
1001	assert(!too_large(abs(op->dst.x), abs(op->dst.y)));
1002	assert(!too_large(op->dst.width, op->dst.height));
1003
1004	if (sna->render_state.gen8.drawrect_limit == limit &&
1005	    sna->render_state.gen8.drawrect_offset == offset)
1006		return true;
1007
1008	sna->render_state.gen8.drawrect_offset = offset;
1009	sna->render_state.gen8.drawrect_limit = limit;
1010
1011	OUT_BATCH(GEN8_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
1012	OUT_BATCH(0);
1013	OUT_BATCH(limit);
1014	OUT_BATCH(offset);
1015	return false;
1016}
1017
1018static void
1019gen8_emit_vertex_elements(struct sna *sna,
1020			  const struct sna_composite_op *op)
1021{
1022	/*
1023	 * vertex data in vertex buffer
1024	 *    position: (x, y)
1025	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
1026	 *    texture coordinate 1 if (has_mask is true): same as above
1027	 */
1028	struct gen8_render_state *render = &sna->render_state.gen8;
1029	uint32_t src_format, dw;
1030	int id = GEN8_VERTEX(op->u.gen8.flags);
1031	bool has_mask;
1032
1033	DBG(("%s: setup id=%d\n", __FUNCTION__, id));
1034
1035	if (render->ve_id == id)
1036		return;
1037	render->ve_id = id;
1038
1039	/* The VUE layout
1040	 *    dword 0-3: pad (0.0, 0.0, 0.0. 0.0)
1041	 *    dword 4-7: position (x, y, 1.0, 1.0),
1042	 *    dword 8-11: texture coordinate 0 (u0, v0, w0, 1.0)
1043	 *    dword 12-15: texture coordinate 1 (u1, v1, w1, 1.0)
1044	 *
1045	 * dword 4-15 are fetched from vertex buffer
1046	 */
1047	has_mask = (id >> 2) != 0;
1048	OUT_BATCH(GEN8_3DSTATE_VERTEX_ELEMENTS |
1049		((2 * (3 + has_mask)) + 1 - 2));
1050
1051	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
1052		  SURFACEFORMAT_R32G32B32A32_FLOAT << VE_FORMAT_SHIFT |
1053		  0 << VE_OFFSET_SHIFT);
1054	OUT_BATCH(COMPONENT_STORE_0 << VE_COMPONENT_0_SHIFT |
1055		  COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT |
1056		  COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT |
1057		  COMPONENT_STORE_0 << VE_COMPONENT_3_SHIFT);
1058
1059	/* x,y */
1060	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
1061		  SURFACEFORMAT_R16G16_SSCALED << VE_FORMAT_SHIFT |
1062		  0 << VE_OFFSET_SHIFT);
1063	OUT_BATCH(COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT |
1064		  COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT |
1065		  COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT |
1066		  COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT);
1067
1068	/* u0, v0, w0 */
1069	DBG(("%s: first channel %d floats, offset=4\n", __FUNCTION__, id & 3));
1070	dw = COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT;
1071	switch (id & 3) {
1072	default:
1073		assert(0);
1074	case 0:
1075		src_format = SURFACEFORMAT_R16G16_SSCALED;
1076		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
1077		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
1078		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
1079		break;
1080	case 1:
1081		src_format = SURFACEFORMAT_R32_FLOAT;
1082		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
1083		dw |= COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT;
1084		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
1085		break;
1086	case 2:
1087		src_format = SURFACEFORMAT_R32G32_FLOAT;
1088		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
1089		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
1090		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
1091		break;
1092	case 3:
1093		src_format = SURFACEFORMAT_R32G32B32_FLOAT;
1094		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
1095		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
1096		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_2_SHIFT;
1097		break;
1098	}
1099	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
1100		  src_format << VE_FORMAT_SHIFT |
1101		  4 << VE_OFFSET_SHIFT);
1102	OUT_BATCH(dw);
1103
1104	/* u1, v1, w1 */
1105	if (has_mask) {
1106		unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
1107		DBG(("%s: second channel %d floats, offset=%d\n", __FUNCTION__, (id >> 2) & 3, offset));
1108		dw = COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT;
1109		switch (id >> 2) {
1110		case 1:
1111			src_format = SURFACEFORMAT_R32_FLOAT;
1112			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
1113			dw |= COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT;
1114			dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
1115			break;
1116		default:
1117			assert(0);
1118		case 2:
1119			src_format = SURFACEFORMAT_R32G32_FLOAT;
1120			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
1121			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
1122			dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
1123			break;
1124		case 3:
1125			src_format = SURFACEFORMAT_R32G32B32_FLOAT;
1126			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
1127			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
1128			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_2_SHIFT;
1129			break;
1130		}
1131		OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
1132			  src_format << VE_FORMAT_SHIFT |
1133			  offset << VE_OFFSET_SHIFT);
1134		OUT_BATCH(dw);
1135	}
1136}
1137
1138inline static void
1139gen8_emit_pipe_invalidate(struct sna *sna)
1140{
1141	OUT_BATCH(GEN8_PIPE_CONTROL | (6 - 2));
1142	OUT_BATCH(PIPE_CONTROL_WC_FLUSH |
1143		  PIPE_CONTROL_TC_FLUSH |
1144		  PIPE_CONTROL_CS_STALL);
1145	OUT_BATCH64(0);
1146	OUT_BATCH64(0);
1147}
1148
1149inline static void
1150gen8_emit_pipe_flush(struct sna *sna, bool need_stall)
1151{
1152	unsigned stall;
1153
1154	stall = 0;
1155	if (need_stall)
1156		stall = (PIPE_CONTROL_CS_STALL |
1157			 PIPE_CONTROL_STALL_AT_SCOREBOARD);
1158
1159	OUT_BATCH(GEN8_PIPE_CONTROL | (6 - 2));
1160	OUT_BATCH(PIPE_CONTROL_WC_FLUSH | stall);
1161	OUT_BATCH64(0);
1162	OUT_BATCH64(0);
1163}
1164
1165inline static void
1166gen8_emit_pipe_stall(struct sna *sna)
1167{
1168	OUT_BATCH(GEN8_PIPE_CONTROL | (6 - 2));
1169	OUT_BATCH(PIPE_CONTROL_CS_STALL |
1170		  PIPE_CONTROL_STALL_AT_SCOREBOARD);
1171	OUT_BATCH64(0);
1172	OUT_BATCH64(0);
1173}
1174
1175static void
1176gen8_emit_state(struct sna *sna,
1177		const struct sna_composite_op *op,
1178		uint16_t wm_binding_table)
1179{
1180	bool need_invalidate;
1181	bool need_flush;
1182	bool need_stall;
1183
1184	assert(op->dst.bo->exec);
1185
1186	need_flush = wm_binding_table & 1 ||
1187		(sna->render_state.gen8.emit_flush && GEN8_READS_DST(op->u.gen8.flags));
1188	if (ALWAYS_FLUSH)
1189		need_flush = true;
1190
1191	wm_binding_table &= ~1;
1192
1193	need_stall = sna->render_state.gen8.surface_table != wm_binding_table;
1194
1195	need_invalidate = kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo);
1196	if (ALWAYS_INVALIDATE)
1197		need_invalidate = true;
1198
1199	need_stall &= gen8_emit_drawing_rectangle(sna, op);
1200	if (ALWAYS_STALL)
1201		need_stall = true;
1202
1203	if (need_invalidate) {
1204		gen8_emit_pipe_invalidate(sna);
1205		kgem_clear_dirty(&sna->kgem);
1206		assert(op->dst.bo->exec);
1207		kgem_bo_mark_dirty(op->dst.bo);
1208
1209		need_flush = false;
1210		need_stall = false;
1211	}
1212	if (need_flush) {
1213		gen8_emit_pipe_flush(sna, need_stall);
1214		need_stall = false;
1215	}
1216	if (need_stall)
1217		gen8_emit_pipe_stall(sna);
1218
1219	gen8_emit_cc(sna, GEN8_BLEND(op->u.gen8.flags));
1220	gen8_emit_sampler(sna, GEN8_SAMPLER(op->u.gen8.flags));
1221	gen8_emit_sf(sna, GEN8_VERTEX(op->u.gen8.flags) >> 2);
1222	gen8_emit_wm(sna, GEN8_KERNEL(op->u.gen8.flags));
1223	gen8_emit_vertex_elements(sna, op);
1224	gen8_emit_binding_table(sna, wm_binding_table);
1225
1226	sna->render_state.gen8.emit_flush = GEN8_READS_DST(op->u.gen8.flags);
1227}
1228
1229static bool gen8_magic_ca_pass(struct sna *sna,
1230			       const struct sna_composite_op *op)
1231{
1232	struct gen8_render_state *state = &sna->render_state.gen8;
1233
1234	if (!op->need_magic_ca_pass)
1235		return false;
1236
1237	DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
1238	     sna->render.vertex_start, sna->render.vertex_index));
1239
1240	gen8_emit_pipe_stall(sna);
1241
1242	gen8_emit_cc(sna,
1243		     GEN8_BLEND(gen8_get_blend(PictOpAdd, true,
1244					       op->dst.format)));
1245	gen8_emit_wm(sna,
1246		     gen8_choose_composite_kernel(PictOpAdd,
1247						  true, true,
1248						  op->is_affine));
1249
1250	OUT_BATCH(GEN8_3DPRIMITIVE | (7 - 2));
1251	OUT_BATCH(0); /* ignored, see VF_TOPOLOGY */
1252	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
1253	OUT_BATCH(sna->render.vertex_start);
1254	OUT_BATCH(1);	/* single instance */
1255	OUT_BATCH(0);	/* start instance location */
1256	OUT_BATCH(0);	/* index buffer offset, ignored */
1257
1258	state->last_primitive = sna->kgem.nbatch;
1259	return true;
1260}
1261
1262static void null_create(struct sna_static_stream *stream)
1263{
1264	/* A bunch of zeros useful for legacy border color and depth-stencil */
1265	sna_static_stream_map(stream, 64, 64);
1266}
1267
1268static void
1269sampler_state_init(struct gen8_sampler_state *sampler_state,
1270		   sampler_filter_t filter,
1271		   sampler_extend_t extend)
1272{
1273	COMPILE_TIME_ASSERT(sizeof(*sampler_state) == 4*sizeof(uint32_t));
1274
1275	sampler_state->ss0.lod_preclamp = 2;	/* GL mode */
1276	sampler_state->ss0.default_color_mode = 1;
1277
1278	switch (filter) {
1279	default:
1280	case SAMPLER_FILTER_NEAREST:
1281		sampler_state->ss0.min_filter = MAPFILTER_NEAREST;
1282		sampler_state->ss0.mag_filter = MAPFILTER_NEAREST;
1283		break;
1284	case SAMPLER_FILTER_BILINEAR:
1285		sampler_state->ss0.min_filter = MAPFILTER_LINEAR;
1286		sampler_state->ss0.mag_filter = MAPFILTER_LINEAR;
1287		break;
1288	}
1289
1290	/* XXX bicubic filter using MAPFILTER_FLEXIBLE */
1291
1292	switch (extend) {
1293	default:
1294	case SAMPLER_EXTEND_NONE:
1295		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_CLAMP_BORDER;
1296		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_CLAMP_BORDER;
1297		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_CLAMP_BORDER;
1298		break;
1299	case SAMPLER_EXTEND_REPEAT:
1300		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_WRAP;
1301		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_WRAP;
1302		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_WRAP;
1303		break;
1304	case SAMPLER_EXTEND_PAD:
1305		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_CLAMP;
1306		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_CLAMP;
1307		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_CLAMP;
1308		break;
1309	case SAMPLER_EXTEND_REFLECT:
1310		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_MIRROR;
1311		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_MIRROR;
1312		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_MIRROR;
1313		break;
1314	}
1315}
1316
1317static void
1318sampler_copy_init(struct gen8_sampler_state *ss)
1319{
1320	sampler_state_init(ss, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
1321	ss->ss3.non_normalized_coord = 1;
1322
1323	sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
1324}
1325
1326static void
1327sampler_fill_init(struct gen8_sampler_state *ss)
1328{
1329	sampler_state_init(ss, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_REPEAT);
1330	ss->ss3.non_normalized_coord = 1;
1331
1332	sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
1333}
1334
1335static uint32_t
1336gen8_tiling_bits(uint32_t tiling)
1337{
1338	switch (tiling) {
1339	default: assert(0);
1340	case I915_TILING_NONE: return 0;
1341	case I915_TILING_X: return SURFACE_TILED;
1342	case I915_TILING_Y: return SURFACE_TILED | SURFACE_TILED_Y;
1343	}
1344}
1345
1346#define MOCS_WT (2 << 5)
1347#define MOCS_WB (3 << 5)
1348#define MOCS_eLLC_ONLY (0 << 3)
1349#define MOCS_LLC_ONLY (1 << 3)
1350#define MOCS_eLLC_LLC (2 << 3)
1351#define MOCS_ALL_CACHES (3 << 3)
1352
1353/**
1354 * Sets up the common fields for a surface state buffer for the given
1355 * picture in the given surface state buffer.
1356 */
1357static uint32_t
1358gen8_bind_bo(struct sna *sna,
1359	     struct kgem_bo *bo,
1360	     uint32_t width,
1361	     uint32_t height,
1362	     uint32_t format,
1363	     bool is_dst)
1364{
1365	uint32_t *ss;
1366	uint32_t domains;
1367	int offset;
1368	uint32_t is_scanout = is_dst && bo->scanout;
1369
1370	/* After the first bind, we manage the cache domains within the batch */
1371	offset = kgem_bo_get_binding(bo, format | is_dst << 30 | is_scanout << 31);
1372	if (offset) {
1373		if (is_dst)
1374			kgem_bo_mark_dirty(bo);
1375		assert(offset >= sna->kgem.surface);
1376		return offset * sizeof(uint32_t);
1377	}
1378
1379	offset = sna->kgem.surface -= SURFACE_DW;
1380	ss = sna->kgem.batch + offset;
1381	ss[0] = (SURFACE_2D << SURFACE_TYPE_SHIFT |
1382		 gen8_tiling_bits(bo->tiling) |
1383		 format << SURFACE_FORMAT_SHIFT |
1384		 SURFACE_VALIGN_4 | SURFACE_HALIGN_4);
1385	if (is_dst) {
1386		ss[0] |= SURFACE_RC_READ_WRITE;
1387		domains = I915_GEM_DOMAIN_RENDER << 16 |I915_GEM_DOMAIN_RENDER;
1388	} else
1389		domains = I915_GEM_DOMAIN_SAMPLER << 16;
1390	ss[1] = (is_dst && is_uncached(sna, bo)) ? 0 : is_scanout ? (MOCS_WT | MOCS_ALL_CACHES) << 24 : (MOCS_WB | MOCS_ALL_CACHES) << 24;
1391	ss[2] = ((width - 1)  << SURFACE_WIDTH_SHIFT |
1392		 (height - 1) << SURFACE_HEIGHT_SHIFT);
1393	ss[3] = (bo->pitch - 1) << SURFACE_PITCH_SHIFT;
1394	ss[4] = 0;
1395	ss[5] = 0;
1396	ss[6] = 0;
1397	ss[7] = SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
1398	*(uint64_t *)(ss+8) = kgem_add_reloc64(&sna->kgem, offset + 8, bo, domains, 0);
1399	ss[10] = 0;
1400	ss[11] = 0;
1401	ss[12] = 0;
1402	ss[13] = 0;
1403	ss[14] = 0;
1404	ss[15] = 0;
1405
1406	kgem_bo_set_binding(bo, format | is_dst << 30 | is_scanout << 31, offset);
1407
1408	DBG(("[%x] bind bo(handle=%d, addr=%lx), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
1409	     offset, bo->handle, *(uint64_t *)(ss+8),
1410	     format, width, height, bo->pitch, bo->tiling,
1411	     domains & 0xffff ? "render" : "sampler"));
1412
1413	return offset * sizeof(uint32_t);
1414}
1415
1416static void gen8_emit_vertex_buffer(struct sna *sna,
1417				    const struct sna_composite_op *op)
1418{
1419	int id = GEN8_VERTEX(op->u.gen8.flags);
1420
1421	OUT_BATCH(GEN8_3DSTATE_VERTEX_BUFFERS | (5 - 2));
1422	OUT_BATCH(id << VB_INDEX_SHIFT | VB_MODIFY_ENABLE |
1423		  4*op->floats_per_vertex);
1424	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
1425	OUT_BATCH64(0);
1426	OUT_BATCH(~0); /* buffer size: disabled */
1427
1428	sna->render.vb_id |= 1 << id;
1429}
1430
1431static void gen8_emit_primitive(struct sna *sna)
1432{
1433	if (sna->kgem.nbatch == sna->render_state.gen8.last_primitive) {
1434		sna->render.vertex_offset = sna->kgem.nbatch - 5;
1435		return;
1436	}
1437
1438	OUT_BATCH(GEN8_3DPRIMITIVE | (7 - 2));
1439	OUT_BATCH(0); /* ignored, see VF_TOPOLOGY */
1440	sna->render.vertex_offset = sna->kgem.nbatch;
1441	OUT_BATCH(0);	/* vertex count, to be filled in later */
1442	OUT_BATCH(sna->render.vertex_index);
1443	OUT_BATCH(1);	/* single instance */
1444	OUT_BATCH(0);	/* start instance location */
1445	OUT_BATCH(0);	/* index buffer offset, ignored */
1446	sna->render.vertex_start = sna->render.vertex_index;
1447
1448	sna->render_state.gen8.last_primitive = sna->kgem.nbatch;
1449}
1450
1451static bool gen8_rectangle_begin(struct sna *sna,
1452				 const struct sna_composite_op *op)
1453{
1454	int id = 1 << GEN8_VERTEX(op->u.gen8.flags);
1455	int ndwords;
1456
1457	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
1458		return true;
1459
1460	ndwords = op->need_magic_ca_pass ? 60 : 6;
1461	if ((sna->render.vb_id & id) == 0)
1462		ndwords += 5;
1463	if (!kgem_check_batch(&sna->kgem, ndwords))
1464		return false;
1465
1466	if ((sna->render.vb_id & id) == 0)
1467		gen8_emit_vertex_buffer(sna, op);
1468
1469	gen8_emit_primitive(sna);
1470	return true;
1471}
1472
1473static int gen8_get_rectangles__flush(struct sna *sna,
1474				      const struct sna_composite_op *op)
1475{
1476	/* Preventing discarding new vbo after lock contention */
1477	if (sna_vertex_wait__locked(&sna->render)) {
1478		int rem = vertex_space(sna);
1479		if (rem > op->floats_per_rect)
1480			return rem;
1481	}
1482
1483	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 6))
1484		return 0;
1485	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
1486		return 0;
1487
1488	if (sna->render.vertex_offset) {
1489		gen8_vertex_flush(sna);
1490		if (gen8_magic_ca_pass(sna, op)) {
1491			gen8_emit_pipe_invalidate(sna);
1492			gen8_emit_cc(sna, GEN8_BLEND(op->u.gen8.flags));
1493			gen8_emit_wm(sna, GEN8_KERNEL(op->u.gen8.flags));
1494		}
1495	}
1496
1497	return gen8_vertex_finish(sna);
1498}
1499
1500inline static int gen8_get_rectangles(struct sna *sna,
1501				      const struct sna_composite_op *op,
1502				      int want,
1503				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
1504{
1505	int rem;
1506
1507	assert(want);
1508
1509start:
1510	rem = vertex_space(sna);
1511	if (unlikely(rem < op->floats_per_rect)) {
1512		DBG(("flushing vbo for %s: %d < %d\n",
1513		     __FUNCTION__, rem, op->floats_per_rect));
1514		rem = gen8_get_rectangles__flush(sna, op);
1515		if (unlikely(rem == 0))
1516			goto flush;
1517	}
1518
1519	if (unlikely(sna->render.vertex_offset == 0)) {
1520		if (!gen8_rectangle_begin(sna, op))
1521			goto flush;
1522		else
1523			goto start;
1524	}
1525
1526	assert(rem <= vertex_space(sna));
1527	assert(op->floats_per_rect <= rem);
1528	if (want > 1 && want * op->floats_per_rect > rem)
1529		want = rem / op->floats_per_rect;
1530
1531	assert(want > 0);
1532	sna->render.vertex_index += 3*want;
1533	return want;
1534
1535flush:
1536	if (sna->render.vertex_offset) {
1537		gen8_vertex_flush(sna);
1538		gen8_magic_ca_pass(sna, op);
1539	}
1540	sna_vertex_wait__locked(&sna->render);
1541	_kgem_submit(&sna->kgem);
1542	emit_state(sna, op);
1543	goto start;
1544}
1545
1546inline static uint32_t *gen8_composite_get_binding_table(struct sna *sna,
1547							 uint16_t *offset)
1548{
1549	uint32_t *table;
1550
1551	assert(sna->kgem.surface <= 16384);
1552	sna->kgem.surface -= SURFACE_DW;
1553	/* Clear all surplus entries to zero in case of prefetch */
1554	table = memset(sna->kgem.batch + sna->kgem.surface, 0, 64);
1555
1556	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));
1557
1558	*offset = sna->kgem.surface;
1559	return table;
1560}
1561
1562static void
1563gen8_get_batch(struct sna *sna, const struct sna_composite_op *op)
1564{
1565	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
1566
1567	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 2*(1+3))) {
1568		DBG(("%s: flushing batch: %d < %d+%d\n",
1569		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
1570		     150, 4*8*2));
1571		_kgem_submit(&sna->kgem);
1572		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
1573	}
1574
1575	assert(sna->kgem.mode == KGEM_RENDER);
1576	assert(sna->kgem.ring == KGEM_RENDER);
1577
1578	if (sna->render_state.gen8.needs_invariant)
1579		gen8_emit_invariant(sna);
1580}
1581
1582static void gen8_emit_composite_state(struct sna *sna,
1583				      const struct sna_composite_op *op)
1584{
1585	uint32_t *binding_table;
1586	uint16_t offset, dirty;
1587
1588	gen8_get_batch(sna, op);
1589
1590	binding_table = gen8_composite_get_binding_table(sna, &offset);
1591
1592	dirty = kgem_bo_is_dirty(op->dst.bo);
1593
1594	binding_table[0] =
1595		gen8_bind_bo(sna,
1596			    op->dst.bo, op->dst.width, op->dst.height,
1597			    gen8_get_dest_format(op->dst.format),
1598			    true);
1599	binding_table[1] =
1600		gen8_bind_bo(sna,
1601			     op->src.bo, op->src.width, op->src.height,
1602			     op->src.card_format,
1603			     false);
1604	if (op->mask.bo) {
1605		binding_table[2] =
1606			gen8_bind_bo(sna,
1607				     op->mask.bo,
1608				     op->mask.width,
1609				     op->mask.height,
1610				     op->mask.card_format,
1611				     false);
1612	}
1613
1614	if (sna->kgem.surface == offset &&
1615	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen8.surface_table) == *(uint64_t*)binding_table &&
1616	    (op->mask.bo == NULL ||
1617	     sna->kgem.batch[sna->render_state.gen8.surface_table+2] == binding_table[2])) {
1618		sna->kgem.surface += SURFACE_DW;
1619		offset = sna->render_state.gen8.surface_table;
1620	}
1621
1622	if (sna->kgem.batch[sna->render_state.gen8.surface_table] == binding_table[0])
1623		dirty = 0;
1624
1625	gen8_emit_state(sna, op, offset | dirty);
1626}
1627
1628static void
1629gen8_align_vertex(struct sna *sna, const struct sna_composite_op *op)
1630{
1631	if (op->floats_per_vertex != sna->render_state.gen8.floats_per_vertex) {
1632		DBG(("aligning vertex: was %d, now %d floats per vertex\n",
1633		     sna->render_state.gen8.floats_per_vertex, op->floats_per_vertex));
1634		gen8_vertex_align(sna, op);
1635		sna->render_state.gen8.floats_per_vertex = op->floats_per_vertex;
1636	}
1637}
1638
1639fastcall static void
1640gen8_render_composite_blt(struct sna *sna,
1641			  const struct sna_composite_op *op,
1642			  const struct sna_composite_rectangles *r)
1643{
1644	gen8_get_rectangles(sna, op, 1, gen8_emit_composite_state);
1645	op->prim_emit(sna, op, r);
1646}
1647
1648fastcall static void
1649gen8_render_composite_box(struct sna *sna,
1650			  const struct sna_composite_op *op,
1651			  const BoxRec *box)
1652{
1653	struct sna_composite_rectangles r;
1654
1655	gen8_get_rectangles(sna, op, 1, gen8_emit_composite_state);
1656
1657	DBG(("  %s: (%d, %d), (%d, %d)\n",
1658	     __FUNCTION__,
1659	     box->x1, box->y1, box->x2, box->y2));
1660
1661	r.dst.x = box->x1;
1662	r.dst.y = box->y1;
1663	r.width  = box->x2 - box->x1;
1664	r.height = box->y2 - box->y1;
1665	r.src = r.mask = r.dst;
1666
1667	op->prim_emit(sna, op, &r);
1668}
1669
1670static void
1671gen8_render_composite_boxes__blt(struct sna *sna,
1672				 const struct sna_composite_op *op,
1673				 const BoxRec *box, int nbox)
1674{
1675	DBG(("composite_boxes(%d)\n", nbox));
1676
1677	do {
1678		int nbox_this_time;
1679
1680		nbox_this_time = gen8_get_rectangles(sna, op, nbox,
1681						     gen8_emit_composite_state);
1682		nbox -= nbox_this_time;
1683
1684		do {
1685			struct sna_composite_rectangles r;
1686
1687			DBG(("  %s: (%d, %d), (%d, %d)\n",
1688			     __FUNCTION__,
1689			     box->x1, box->y1, box->x2, box->y2));
1690
1691			r.dst.x = box->x1;
1692			r.dst.y = box->y1;
1693			r.width  = box->x2 - box->x1;
1694			r.height = box->y2 - box->y1;
1695			r.src = r.mask = r.dst;
1696
1697			op->prim_emit(sna, op, &r);
1698			box++;
1699		} while (--nbox_this_time);
1700	} while (nbox);
1701}
1702
1703static void
1704gen8_render_composite_boxes(struct sna *sna,
1705			    const struct sna_composite_op *op,
1706			    const BoxRec *box, int nbox)
1707{
1708	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
1709
1710	do {
1711		int nbox_this_time;
1712		float *v;
1713
1714		nbox_this_time = gen8_get_rectangles(sna, op, nbox,
1715						     gen8_emit_composite_state);
1716		assert(nbox_this_time);
1717		nbox -= nbox_this_time;
1718
1719		v = sna->render.vertices + sna->render.vertex_used;
1720		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
1721
1722		op->emit_boxes(op, box, nbox_this_time, v);
1723		box += nbox_this_time;
1724	} while (nbox);
1725}
1726
1727static void
1728gen8_render_composite_boxes__thread(struct sna *sna,
1729				    const struct sna_composite_op *op,
1730				    const BoxRec *box, int nbox)
1731{
1732	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
1733
1734	sna_vertex_lock(&sna->render);
1735	do {
1736		int nbox_this_time;
1737		float *v;
1738
1739		nbox_this_time = gen8_get_rectangles(sna, op, nbox,
1740						     gen8_emit_composite_state);
1741		assert(nbox_this_time);
1742		nbox -= nbox_this_time;
1743
1744		v = sna->render.vertices + sna->render.vertex_used;
1745		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
1746
1747		sna_vertex_acquire__locked(&sna->render);
1748		sna_vertex_unlock(&sna->render);
1749
1750		op->emit_boxes(op, box, nbox_this_time, v);
1751		box += nbox_this_time;
1752
1753		sna_vertex_lock(&sna->render);
1754		sna_vertex_release__locked(&sna->render);
1755	} while (nbox);
1756	sna_vertex_unlock(&sna->render);
1757}
1758
1759static uint32_t
1760gen8_create_blend_state(struct sna_static_stream *stream)
1761{
1762	char *base, *ptr;
1763	int src, dst;
1764
1765	COMPILE_TIME_ASSERT(((GEN8_BLENDFACTOR_COUNT * GEN8_BLENDFACTOR_COUNT << 4) & (1 << 15)) == 0);
1766
1767	base = sna_static_stream_map(stream,
1768				     GEN8_BLENDFACTOR_COUNT * GEN8_BLENDFACTOR_COUNT * GEN8_BLEND_STATE_PADDED_SIZE,
1769				     64);
1770
1771	ptr = base;
1772	for (src = 0; src < GEN8_BLENDFACTOR_COUNT; src++) {
1773		for (dst = 0; dst < GEN8_BLENDFACTOR_COUNT; dst++) {
1774			struct gen8_blend_state *blend =
1775				(struct gen8_blend_state *)ptr;
1776
1777			assert(((ptr - base) & 63) == 0);
1778			COMPILE_TIME_ASSERT(sizeof(blend->common) == 4);
1779			COMPILE_TIME_ASSERT(sizeof(blend->rt) == 8);
1780			COMPILE_TIME_ASSERT((char *)&blend->rt - (char *)blend == 4);
1781
1782			blend->rt.post_blend_clamp = 1;
1783			blend->rt.pre_blend_clamp = 1;
1784
1785			blend->rt.color_blend =
1786				!(dst == BLENDFACTOR_ZERO && src == BLENDFACTOR_ONE);
1787			blend->rt.dest_blend_factor = dst;
1788			blend->rt.source_blend_factor = src;
1789			blend->rt.color_blend_function = BLENDFUNCTION_ADD;
1790
1791			blend->rt.dest_alpha_blend_factor = dst;
1792			blend->rt.source_alpha_blend_factor = src;
1793			blend->rt.alpha_blend_function = BLENDFUNCTION_ADD;
1794
1795			ptr += GEN8_BLEND_STATE_PADDED_SIZE;
1796		}
1797	}
1798
1799	return sna_static_stream_offsetof(stream, base);
1800}
1801
1802static int
1803gen8_composite_picture(struct sna *sna,
1804		       PicturePtr picture,
1805		       struct sna_composite_channel *channel,
1806		       int x, int y,
1807		       int w, int h,
1808		       int dst_x, int dst_y,
1809		       bool precise)
1810{
1811	PixmapPtr pixmap;
1812	uint32_t color;
1813	int16_t dx, dy;
1814
1815	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
1816	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
1817
1818	channel->is_solid = false;
1819	channel->card_format = -1;
1820
1821	if (sna_picture_is_solid(picture, &color))
1822		return gen4_channel_init_solid(sna, channel, color);
1823
1824	if (picture->pDrawable == NULL) {
1825		int ret;
1826
1827		if (picture->pSourcePict->type == SourcePictTypeLinear)
1828			return gen4_channel_init_linear(sna, picture, channel,
1829							x, y,
1830							w, h,
1831							dst_x, dst_y);
1832
1833		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
1834		ret = -1;
1835		if (!precise)
1836			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
1837								      x, y, w, h, dst_x, dst_y);
1838		if (ret == -1)
1839			ret = sna_render_picture_fixup(sna, picture, channel,
1840						       x, y, w, h, dst_x, dst_y);
1841		return ret;
1842	}
1843
1844	if (picture->alphaMap) {
1845		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
1846		return sna_render_picture_fixup(sna, picture, channel,
1847						x, y, w, h, dst_x, dst_y);
1848	}
1849
1850	if (!gen8_check_repeat(picture))
1851		return sna_render_picture_fixup(sna, picture, channel,
1852						x, y, w, h, dst_x, dst_y);
1853
1854	if (!gen8_check_filter(picture))
1855		return sna_render_picture_fixup(sna, picture, channel,
1856						x, y, w, h, dst_x, dst_y);
1857
1858	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
1859	channel->filter = picture->filter;
1860
1861	pixmap = get_drawable_pixmap(picture->pDrawable);
1862	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);
1863
1864	x += dx + picture->pDrawable->x;
1865	y += dy + picture->pDrawable->y;
1866
1867	channel->is_affine = sna_transform_is_affine(picture->transform);
1868	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
1869		DBG(("%s: integer translation (%d, %d), removing\n",
1870		     __FUNCTION__, dx, dy));
1871		x += dx;
1872		y += dy;
1873		channel->transform = NULL;
1874		channel->filter = PictFilterNearest;
1875
1876		if (channel->repeat ||
1877		    (x >= 0 &&
1878		     y >= 0 &&
1879		     x + w < pixmap->drawable.width &&
1880		     y + h < pixmap->drawable.height)) {
1881			struct sna_pixmap *priv = sna_pixmap(pixmap);
1882			if (priv && priv->clear) {
1883				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
1884				return gen4_channel_init_solid(sna, channel, priv->clear_color);
1885			}
1886		}
1887	} else
1888		channel->transform = picture->transform;
1889
1890	channel->pict_format = picture->format;
1891	channel->card_format = gen8_get_card_format(picture->format);
1892	if (channel->card_format == (unsigned)-1)
1893		return sna_render_picture_convert(sna, picture, channel, pixmap,
1894						  x, y, w, h, dst_x, dst_y,
1895						  false);
1896
1897	if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
1898		DBG(("%s: extracting from pixmap %dx%d\n", __FUNCTION__,
1899		     pixmap->drawable.width, pixmap->drawable.height));
1900		return sna_render_picture_extract(sna, picture, channel,
1901						  x, y, w, h, dst_x, dst_y);
1902	}
1903
1904	return sna_render_pixmap_bo(sna, channel, pixmap,
1905				    x, y, w, h, dst_x, dst_y);
1906}
1907
1908inline static bool gen8_composite_channel_convert(struct sna_composite_channel *channel)
1909{
1910	if (unaligned(channel->bo, PICT_FORMAT_BPP(channel->pict_format)))
1911		return false;
1912
1913	channel->repeat = gen8_repeat(channel->repeat);
1914	channel->filter = gen8_filter(channel->filter);
1915	if (channel->card_format == (unsigned)-1)
1916		channel->card_format = gen8_get_card_format(channel->pict_format);
1917	assert(channel->card_format != (unsigned)-1);
1918
1919	return true;
1920}
1921
1922static void gen8_render_composite_done(struct sna *sna,
1923				       const struct sna_composite_op *op)
1924{
1925	if (sna->render.vertex_offset) {
1926		gen8_vertex_flush(sna);
1927		gen8_magic_ca_pass(sna, op);
1928	}
1929
1930	if (op->mask.bo)
1931		kgem_bo_destroy(&sna->kgem, op->mask.bo);
1932	if (op->src.bo)
1933		kgem_bo_destroy(&sna->kgem, op->src.bo);
1934
1935	sna_render_composite_redirect_done(sna, op);
1936}
1937
1938inline static bool
1939gen8_composite_set_target(struct sna *sna,
1940			  struct sna_composite_op *op,
1941			  PicturePtr dst,
1942			  int x, int y, int w, int h,
1943			  bool partial)
1944{
1945	BoxRec box;
1946	unsigned int hint;
1947
1948	DBG(("%s: (%d, %d)x(%d, %d), partial?=%d\n", __FUNCTION__, x, y, w, h, partial));
1949
1950	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
1951	op->dst.format = dst->format;
1952	op->dst.width  = op->dst.pixmap->drawable.width;
1953	op->dst.height = op->dst.pixmap->drawable.height;
1954
1955	if (w | h) {
1956		assert(w && h);
1957		box.x1 = x;
1958		box.y1 = y;
1959		box.x2 = x + w;
1960		box.y2 = y + h;
1961	} else
1962		sna_render_picture_extents(dst, &box);
1963
1964	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
1965	if (!partial) {
1966		hint |= IGNORE_DAMAGE;
1967		if (w == op->dst.width && h == op->dst.height)
1968			hint |= REPLACES;
1969	}
1970
1971	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
1972	if (op->dst.bo == NULL)
1973		return false;
1974
1975	if (unaligned(op->dst.bo, dst->pDrawable->bitsPerPixel))
1976		return false;
1977
1978	if (hint & REPLACES) {
1979		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
1980		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
1981	}
1982
1983	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
1984			    &op->dst.x, &op->dst.y);
1985
1986	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
1987	     __FUNCTION__,
1988	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
1989	     op->dst.width, op->dst.height,
1990	     op->dst.bo->pitch,
1991	     op->dst.x, op->dst.y,
1992	     op->damage ? *op->damage : (void *)-1));
1993
1994	assert(op->dst.bo->proxy == NULL);
1995
1996	if (too_large(op->dst.width, op->dst.height) &&
1997	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
1998		return false;
1999
2000	return true;
2001}
2002
2003static bool
2004try_blt(struct sna *sna,
2005	PicturePtr dst, PicturePtr src,
2006	int width, int height)
2007{
2008	struct kgem_bo *bo;
2009
2010	if (sna->kgem.mode == KGEM_BLT) {
2011		DBG(("%s: already performing BLT\n", __FUNCTION__));
2012		return true;
2013	}
2014
2015	if (too_large(width, height)) {
2016		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
2017		     __FUNCTION__, width, height));
2018		return true;
2019	}
2020
2021	bo = __sna_drawable_peek_bo(dst->pDrawable);
2022	if (bo == NULL)
2023		return true;
2024	if (bo->rq)
2025		return RQ_IS_BLT(bo->rq);
2026
2027	if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, bo, 0))
2028		return true;
2029
2030	if (src->pDrawable) {
2031		bo = __sna_drawable_peek_bo(src->pDrawable);
2032		if (bo == NULL)
2033			return true;
2034
2035		if (prefer_blt_bo(sna, bo))
2036			return RQ_IS_BLT(bo->rq);
2037	}
2038
2039	if (sna->kgem.ring == KGEM_BLT) {
2040		DBG(("%s: already performing BLT\n", __FUNCTION__));
2041		return true;
2042	}
2043
2044	return false;
2045}
2046
2047static bool
2048check_gradient(PicturePtr picture, bool precise)
2049{
2050	if (picture->pDrawable)
2051		return false;
2052
2053	switch (picture->pSourcePict->type) {
2054	case SourcePictTypeSolidFill:
2055	case SourcePictTypeLinear:
2056		return false;
2057	default:
2058		return precise;
2059	}
2060}
2061
2062static bool
2063has_alphamap(PicturePtr p)
2064{
2065	return p->alphaMap != NULL;
2066}
2067
2068static bool
2069need_upload(PicturePtr p)
2070{
2071	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
2072}
2073
2074static bool
2075source_is_busy(PixmapPtr pixmap)
2076{
2077	struct sna_pixmap *priv = sna_pixmap(pixmap);
2078	if (priv == NULL || priv->clear)
2079		return false;
2080
2081	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
2082		return true;
2083
2084	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
2085		return true;
2086
2087	return priv->gpu_damage && !priv->cpu_damage;
2088}
2089
2090static bool
2091source_fallback(PicturePtr p, PixmapPtr pixmap, bool precise)
2092{
2093	if (sna_picture_is_solid(p, NULL))
2094		return false;
2095
2096	if (p->pSourcePict)
2097		return check_gradient(p, precise);
2098
2099	if (!gen8_check_repeat(p) || !gen8_check_format(p->format))
2100		return true;
2101
2102	if (pixmap && source_is_busy(pixmap))
2103		return false;
2104
2105	return has_alphamap(p) || !gen8_check_filter(p) || need_upload(p);
2106}
2107
2108static bool
2109gen8_composite_fallback(struct sna *sna,
2110			PicturePtr src,
2111			PicturePtr mask,
2112			PicturePtr dst)
2113{
2114	PixmapPtr src_pixmap;
2115	PixmapPtr mask_pixmap;
2116	PixmapPtr dst_pixmap;
2117	bool src_fallback, mask_fallback;
2118
2119	if (!gen8_check_dst_format(dst->format)) {
2120		DBG(("%s: unknown destination format: %d\n",
2121		     __FUNCTION__, dst->format));
2122		return true;
2123	}
2124
2125	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
2126
2127	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
2128	src_fallback = source_fallback(src, src_pixmap,
2129				       dst->polyMode == PolyModePrecise);
2130
2131	if (mask) {
2132		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
2133		mask_fallback = source_fallback(mask, mask_pixmap,
2134						dst->polyMode == PolyModePrecise);
2135	} else {
2136		mask_pixmap = NULL;
2137		mask_fallback = false;
2138	}
2139
2140	/* If we are using the destination as a source and need to
2141	 * readback in order to upload the source, do it all
2142	 * on the cpu.
2143	 */
2144	if (src_pixmap == dst_pixmap && src_fallback) {
2145		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
2146		return true;
2147	}
2148	if (mask_pixmap == dst_pixmap && mask_fallback) {
2149		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
2150		return true;
2151	}
2152
2153	/* If anything is on the GPU, push everything out to the GPU */
2154	if (dst_use_gpu(dst_pixmap)) {
2155		DBG(("%s: dst is already on the GPU, try to use GPU\n",
2156		     __FUNCTION__));
2157		return false;
2158	}
2159
2160	if (src_pixmap && !src_fallback) {
2161		DBG(("%s: src is already on the GPU, try to use GPU\n",
2162		     __FUNCTION__));
2163		return false;
2164	}
2165	if (mask_pixmap && !mask_fallback) {
2166		DBG(("%s: mask is already on the GPU, try to use GPU\n",
2167		     __FUNCTION__));
2168		return false;
2169	}
2170
2171	/* However if the dst is not on the GPU and we need to
2172	 * render one of the sources using the CPU, we may
2173	 * as well do the entire operation in place onthe CPU.
2174	 */
2175	if (src_fallback) {
2176		DBG(("%s: dst is on the CPU and src will fallback\n",
2177		     __FUNCTION__));
2178		return true;
2179	}
2180
2181	if (mask && mask_fallback) {
2182		DBG(("%s: dst is on the CPU and mask will fallback\n",
2183		     __FUNCTION__));
2184		return true;
2185	}
2186
2187	if (too_large(dst_pixmap->drawable.width,
2188		      dst_pixmap->drawable.height) &&
2189	    dst_is_cpu(dst_pixmap)) {
2190		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
2191		return true;
2192	}
2193
2194	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
2195	     __FUNCTION__));
2196	return dst_use_cpu(dst_pixmap);
2197}
2198
2199static int
2200reuse_source(struct sna *sna,
2201	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
2202	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
2203{
2204	uint32_t color;
2205
2206	if (src_x != msk_x || src_y != msk_y)
2207		return false;
2208
2209	if (src == mask) {
2210		DBG(("%s: mask is source\n", __FUNCTION__));
2211		*mc = *sc;
2212		mc->bo = kgem_bo_reference(mc->bo);
2213		return true;
2214	}
2215
2216	if (sna_picture_is_solid(mask, &color))
2217		return gen4_channel_init_solid(sna, mc, color);
2218
2219	if (sc->is_solid)
2220		return false;
2221
2222	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
2223		return false;
2224
2225	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));
2226
2227	if (!sna_transform_equal(src->transform, mask->transform))
2228		return false;
2229
2230	if (!sna_picture_alphamap_equal(src, mask))
2231		return false;
2232
2233	if (!gen8_check_repeat(mask))
2234		return false;
2235
2236	if (!gen8_check_filter(mask))
2237		return false;
2238
2239	if (!gen8_check_format(mask->format))
2240		return false;
2241
2242	DBG(("%s: reusing source channel for mask with a twist\n",
2243	     __FUNCTION__));
2244
2245	*mc = *sc;
2246	mc->repeat = gen8_repeat(mask->repeat ? mask->repeatType : RepeatNone);
2247	mc->filter = gen8_filter(mask->filter);
2248	mc->pict_format = mask->format;
2249	mc->card_format = gen8_get_card_format(mask->format);
2250	mc->bo = kgem_bo_reference(mc->bo);
2251	return true;
2252}
2253
2254static bool
2255gen8_render_composite(struct sna *sna,
2256		      uint8_t op,
2257		      PicturePtr src,
2258		      PicturePtr mask,
2259		      PicturePtr dst,
2260		      int16_t src_x, int16_t src_y,
2261		      int16_t msk_x, int16_t msk_y,
2262		      int16_t dst_x, int16_t dst_y,
2263		      int16_t width, int16_t height,
2264		      unsigned flags,
2265		      struct sna_composite_op *tmp)
2266{
2267	if (op >= ARRAY_SIZE(gen8_blend_op))
2268		return false;
2269
2270	DBG(("%s: %dx%d, current mode=%d/%d\n", __FUNCTION__,
2271	     width, height, sna->kgem.mode, sna->kgem.ring));
2272
2273	if (mask == NULL &&
2274	    try_blt(sna, dst, src, width, height) &&
2275	    sna_blt_composite(sna, op,
2276			      src, dst,
2277			      src_x, src_y,
2278			      dst_x, dst_y,
2279			      width, height,
2280			      flags, tmp))
2281		return true;
2282
2283	if (gen8_composite_fallback(sna, src, mask, dst))
2284		goto fallback;
2285
2286	if (need_tiling(sna, width, height))
2287		return sna_tiling_composite(op, src, mask, dst,
2288					    src_x, src_y,
2289					    msk_x, msk_y,
2290					    dst_x, dst_y,
2291					    width, height,
2292					    tmp);
2293
2294	if (op == PictOpClear && src == sna->clear)
2295		op = PictOpSrc;
2296	tmp->op = op;
2297	if (!gen8_composite_set_target(sna, tmp, dst,
2298				       dst_x, dst_y, width, height,
2299				       flags & COMPOSITE_PARTIAL || op > PictOpSrc))
2300		goto fallback;
2301
2302	switch (gen8_composite_picture(sna, src, &tmp->src,
2303				       src_x, src_y,
2304				       width, height,
2305				       dst_x, dst_y,
2306				       dst->polyMode == PolyModePrecise)) {
2307	case -1:
2308		goto cleanup_dst;
2309	case 0:
2310		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
2311			goto cleanup_dst;
2312		/* fall through to fixup */
2313	case 1:
2314		/* Did we just switch rings to prepare the source? */
2315		if (mask == NULL &&
2316		    (prefer_blt_composite(sna, tmp) ||
2317		     unaligned(tmp->src.bo, PICT_FORMAT_BPP(tmp->src.pict_format))) &&
2318		    sna_blt_composite__convert(sna,
2319					       dst_x, dst_y, width, height,
2320					       tmp))
2321			return true;
2322
2323		if (!gen8_composite_channel_convert(&tmp->src))
2324			goto cleanup_src;
2325
2326		break;
2327	}
2328
2329	tmp->is_affine = tmp->src.is_affine;
2330	tmp->has_component_alpha = false;
2331	tmp->need_magic_ca_pass = false;
2332
2333	tmp->mask.bo = NULL;
2334	tmp->mask.filter = SAMPLER_FILTER_NEAREST;
2335	tmp->mask.repeat = SAMPLER_EXTEND_NONE;
2336
2337	if (mask) {
2338		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
2339			tmp->has_component_alpha = true;
2340
2341			/* Check if it's component alpha that relies on a source alpha and on
2342			 * the source value.  We can only get one of those into the single
2343			 * source value that we get to blend with.
2344			 */
2345			if (gen8_blend_op[op].src_alpha &&
2346			    (gen8_blend_op[op].src_blend != BLENDFACTOR_ZERO)) {
2347				if (op != PictOpOver)
2348					goto cleanup_src;
2349
2350				tmp->need_magic_ca_pass = true;
2351				tmp->op = PictOpOutReverse;
2352			}
2353		}
2354
2355		if (!reuse_source(sna,
2356				  src, &tmp->src, src_x, src_y,
2357				  mask, &tmp->mask, msk_x, msk_y)) {
2358			switch (gen8_composite_picture(sna, mask, &tmp->mask,
2359						       msk_x, msk_y,
2360						       width, height,
2361						       dst_x, dst_y,
2362						       dst->polyMode == PolyModePrecise)) {
2363			case -1:
2364				goto cleanup_src;
2365			case 0:
2366				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
2367					goto cleanup_src;
2368				/* fall through to fixup */
2369			case 1:
2370				if (!gen8_composite_channel_convert(&tmp->mask))
2371					goto cleanup_mask;
2372				break;
2373			}
2374		}
2375
2376		tmp->is_affine &= tmp->mask.is_affine;
2377	}
2378
2379	tmp->u.gen8.flags =
2380		GEN8_SET_FLAGS(SAMPLER_OFFSET(tmp->src.filter,
2381					      tmp->src.repeat,
2382					      tmp->mask.filter,
2383					      tmp->mask.repeat),
2384			       gen8_get_blend(tmp->op,
2385					      tmp->has_component_alpha,
2386					      tmp->dst.format),
2387			       gen8_choose_composite_kernel(tmp->op,
2388							    tmp->mask.bo != NULL,
2389							    tmp->has_component_alpha,
2390							    tmp->is_affine),
2391			       gen4_choose_composite_emitter(sna, tmp));
2392
2393	tmp->blt   = gen8_render_composite_blt;
2394	tmp->box   = gen8_render_composite_box;
2395	tmp->boxes = gen8_render_composite_boxes__blt;
2396	if (tmp->emit_boxes){
2397		tmp->boxes = gen8_render_composite_boxes;
2398		tmp->thread_boxes = gen8_render_composite_boxes__thread;
2399	}
2400	tmp->done  = gen8_render_composite_done;
2401
2402	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
2403	if (!kgem_check_bo(&sna->kgem,
2404			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2405			   NULL)) {
2406		kgem_submit(&sna->kgem);
2407		if (!kgem_check_bo(&sna->kgem,
2408				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2409				   NULL))
2410			goto cleanup_mask;
2411		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
2412	}
2413
2414	gen8_align_vertex(sna, tmp);
2415	gen8_emit_composite_state(sna, tmp);
2416	return true;
2417
2418cleanup_mask:
2419	if (tmp->mask.bo) {
2420		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
2421		tmp->mask.bo = NULL;
2422	}
2423cleanup_src:
2424	if (tmp->src.bo) {
2425		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
2426		tmp->src.bo = NULL;
2427	}
2428cleanup_dst:
2429	if (tmp->redirect.real_bo) {
2430		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
2431		tmp->redirect.real_bo = NULL;
2432	}
2433fallback:
2434	return (mask == NULL &&
2435		sna_blt_composite(sna, op,
2436				  src, dst,
2437				  src_x, src_y,
2438				  dst_x, dst_y,
2439				  width, height,
2440				  flags | COMPOSITE_FALLBACK, tmp));
2441}
2442
2443#if !NO_COMPOSITE_SPANS
2444fastcall static void
2445gen8_render_composite_spans_box(struct sna *sna,
2446				const struct sna_composite_spans_op *op,
2447				const BoxRec *box, float opacity)
2448{
2449	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
2450	     __FUNCTION__,
2451	     op->base.src.offset[0], op->base.src.offset[1],
2452	     opacity,
2453	     op->base.dst.x, op->base.dst.y,
2454	     box->x1, box->y1,
2455	     box->x2 - box->x1,
2456	     box->y2 - box->y1));
2457
2458	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_composite_state);
2459	op->prim_emit(sna, op, box, opacity);
2460}
2461
2462static void
2463gen8_render_composite_spans_boxes(struct sna *sna,
2464				  const struct sna_composite_spans_op *op,
2465				  const BoxRec *box, int nbox,
2466				  float opacity)
2467{
2468	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
2469	     __FUNCTION__, nbox,
2470	     op->base.src.offset[0], op->base.src.offset[1],
2471	     opacity,
2472	     op->base.dst.x, op->base.dst.y));
2473
2474	do {
2475		int nbox_this_time;
2476
2477		nbox_this_time = gen8_get_rectangles(sna, &op->base, nbox,
2478						     gen8_emit_composite_state);
2479		nbox -= nbox_this_time;
2480
2481		do {
2482			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2483			     box->x1, box->y1,
2484			     box->x2 - box->x1,
2485			     box->y2 - box->y1));
2486
2487			op->prim_emit(sna, op, box++, opacity);
2488		} while (--nbox_this_time);
2489	} while (nbox);
2490}
2491
2492fastcall static void
2493gen8_render_composite_spans_boxes__thread(struct sna *sna,
2494					  const struct sna_composite_spans_op *op,
2495					  const struct sna_opacity_box *box,
2496					  int nbox)
2497{
2498	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
2499	     __FUNCTION__, nbox,
2500	     op->base.src.offset[0], op->base.src.offset[1],
2501	     op->base.dst.x, op->base.dst.y));
2502
2503	sna_vertex_lock(&sna->render);
2504	do {
2505		int nbox_this_time;
2506		float *v;
2507
2508		nbox_this_time = gen8_get_rectangles(sna, &op->base, nbox,
2509						     gen8_emit_composite_state);
2510		assert(nbox_this_time);
2511		nbox -= nbox_this_time;
2512
2513		v = sna->render.vertices + sna->render.vertex_used;
2514		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
2515
2516		sna_vertex_acquire__locked(&sna->render);
2517		sna_vertex_unlock(&sna->render);
2518
2519		op->emit_boxes(op, box, nbox_this_time, v);
2520		box += nbox_this_time;
2521
2522		sna_vertex_lock(&sna->render);
2523		sna_vertex_release__locked(&sna->render);
2524	} while (nbox);
2525	sna_vertex_unlock(&sna->render);
2526}
2527
2528fastcall static void
2529gen8_render_composite_spans_done(struct sna *sna,
2530				 const struct sna_composite_spans_op *op)
2531{
2532	if (sna->render.vertex_offset)
2533		gen8_vertex_flush(sna);
2534
2535	DBG(("%s()\n", __FUNCTION__));
2536
2537	if (op->base.src.bo)
2538		kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2539
2540	sna_render_composite_redirect_done(sna, &op->base);
2541}
2542
2543static bool
2544gen8_check_composite_spans(struct sna *sna,
2545			   uint8_t op, PicturePtr src, PicturePtr dst,
2546			   int16_t width, int16_t height, unsigned flags)
2547{
2548	if (op >= ARRAY_SIZE(gen8_blend_op))
2549		return false;
2550
2551	if (gen8_composite_fallback(sna, src, NULL, dst))
2552		return false;
2553
2554	if (need_tiling(sna, width, height) &&
2555	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
2556		DBG(("%s: fallback, tiled operation not on GPU\n",
2557		     __FUNCTION__));
2558		return false;
2559	}
2560
2561	return true;
2562}
2563
2564static bool
2565gen8_render_composite_spans(struct sna *sna,
2566			    uint8_t op,
2567			    PicturePtr src,
2568			    PicturePtr dst,
2569			    int16_t src_x,  int16_t src_y,
2570			    int16_t dst_x,  int16_t dst_y,
2571			    int16_t width,  int16_t height,
2572			    unsigned flags,
2573			    struct sna_composite_spans_op *tmp)
2574{
2575	DBG(("%s: %dx%d with flags=%x, current mode=%d\n", __FUNCTION__,
2576	     width, height, flags, sna->kgem.ring));
2577
2578	assert(gen8_check_composite_spans(sna, op, src, dst, width, height, flags));
2579
2580	if (need_tiling(sna, width, height)) {
2581		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
2582		     __FUNCTION__, width, height));
2583		return sna_tiling_composite_spans(op, src, dst,
2584						  src_x, src_y, dst_x, dst_y,
2585						  width, height, flags, tmp);
2586	}
2587
2588	tmp->base.op = op;
2589	if (!gen8_composite_set_target(sna, &tmp->base, dst,
2590				       dst_x, dst_y, width, height, true))
2591		return false;
2592
2593	switch (gen8_composite_picture(sna, src, &tmp->base.src,
2594				       src_x, src_y,
2595				       width, height,
2596				       dst_x, dst_y,
2597				       dst->polyMode == PolyModePrecise)) {
2598	case -1:
2599		goto cleanup_dst;
2600	case 0:
2601		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
2602			goto cleanup_dst;
2603		/* fall through to fixup */
2604	case 1:
2605		if (!gen8_composite_channel_convert(&tmp->base.src))
2606			goto cleanup_src;
2607		break;
2608	}
2609	tmp->base.mask.bo = NULL;
2610
2611	tmp->base.is_affine = tmp->base.src.is_affine;
2612	tmp->base.need_magic_ca_pass = false;
2613
2614	tmp->base.u.gen8.flags =
2615		GEN8_SET_FLAGS(SAMPLER_OFFSET(tmp->base.src.filter,
2616					      tmp->base.src.repeat,
2617					      SAMPLER_FILTER_NEAREST,
2618					      SAMPLER_EXTEND_PAD),
2619			       gen8_get_blend(tmp->base.op, false, tmp->base.dst.format),
2620			       GEN8_WM_KERNEL_OPACITY | !tmp->base.is_affine,
2621			       gen4_choose_spans_emitter(sna, tmp));
2622
2623	tmp->box   = gen8_render_composite_spans_box;
2624	tmp->boxes = gen8_render_composite_spans_boxes;
2625	if (tmp->emit_boxes)
2626		tmp->thread_boxes = gen8_render_composite_spans_boxes__thread;
2627	tmp->done  = gen8_render_composite_spans_done;
2628
2629	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
2630	if (!kgem_check_bo(&sna->kgem,
2631			   tmp->base.dst.bo, tmp->base.src.bo,
2632			   NULL)) {
2633		kgem_submit(&sna->kgem);
2634		if (!kgem_check_bo(&sna->kgem,
2635				   tmp->base.dst.bo, tmp->base.src.bo,
2636				   NULL))
2637			goto cleanup_src;
2638		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
2639	}
2640
2641	gen8_align_vertex(sna, &tmp->base);
2642	gen8_emit_composite_state(sna, &tmp->base);
2643	return true;
2644
2645cleanup_src:
2646	if (tmp->base.src.bo)
2647		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
2648cleanup_dst:
2649	if (tmp->base.redirect.real_bo)
2650		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
2651	return false;
2652}
2653#endif
2654
2655static void
2656gen8_emit_copy_state(struct sna *sna,
2657		     const struct sna_composite_op *op)
2658{
2659	uint32_t *binding_table;
2660	uint16_t offset, dirty;
2661
2662	gen8_get_batch(sna, op);
2663
2664	binding_table = gen8_composite_get_binding_table(sna, &offset);
2665
2666	dirty = kgem_bo_is_dirty(op->dst.bo);
2667
2668	binding_table[0] =
2669		gen8_bind_bo(sna,
2670			     op->dst.bo, op->dst.width, op->dst.height,
2671			     gen8_get_dest_format(op->dst.format),
2672			     true);
2673	binding_table[1] =
2674		gen8_bind_bo(sna,
2675			     op->src.bo, op->src.width, op->src.height,
2676			     op->src.card_format,
2677			     false);
2678
2679	if (sna->kgem.surface == offset &&
2680	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen8.surface_table) == *(uint64_t*)binding_table) {
2681		sna->kgem.surface += SURFACE_DW;
2682		offset = sna->render_state.gen8.surface_table;
2683	}
2684
2685	if (sna->kgem.batch[sna->render_state.gen8.surface_table] == binding_table[0])
2686		dirty = 0;
2687
2688	assert(!GEN8_READS_DST(op->u.gen8.flags));
2689	gen8_emit_state(sna, op, offset | dirty);
2690}
2691
2692static inline bool
2693prefer_blt_copy(struct sna *sna,
2694		struct kgem_bo *src_bo,
2695		struct kgem_bo *dst_bo,
2696		unsigned flags)
2697{
2698	if (sna->kgem.mode == KGEM_BLT)
2699		return true;
2700
2701	assert((flags & COPY_SYNC) == 0);
2702
2703	if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
2704		return true;
2705
2706	if (untiled_tlb_miss(src_bo) ||
2707	    untiled_tlb_miss(dst_bo))
2708		return true;
2709
2710	if (force_blt_ring(sna))
2711		return true;
2712
2713	if (kgem_bo_is_render(dst_bo) ||
2714	    kgem_bo_is_render(src_bo))
2715		return false;
2716
2717	if (prefer_render_ring(sna, dst_bo))
2718		return false;
2719
2720	if (!prefer_blt_ring(sna, dst_bo, flags))
2721		return false;
2722
2723	return prefer_blt_bo(sna, src_bo) || prefer_blt_bo(sna, dst_bo);
2724}
2725
2726static bool
2727gen8_render_copy_boxes(struct sna *sna, uint8_t alu,
2728		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
2729		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
2730		       const BoxRec *box, int n, unsigned flags)
2731{
2732	struct sna_composite_op tmp;
2733	BoxRec extents;
2734
2735	DBG(("%s (%d, %d)->(%d, %d) x %d, alu=%x, flags=%x, self-copy=%d, overlaps? %d\n",
2736	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu, flags,
2737	     src_bo == dst_bo,
2738	     overlaps(sna,
2739		      src_bo, src_dx, src_dy,
2740		      dst_bo, dst_dx, dst_dy,
2741		      box, n, flags, &extents)));
2742
2743	if (prefer_blt_copy(sna, src_bo, dst_bo, flags) &&
2744	    sna_blt_compare_depth(src, dst) &&
2745	    sna_blt_copy_boxes(sna, alu,
2746			       src_bo, src_dx, src_dy,
2747			       dst_bo, dst_dx, dst_dy,
2748			       dst->bitsPerPixel,
2749			       box, n))
2750		return true;
2751
2752	if (!(alu == GXcopy || alu == GXclear) ||
2753	    unaligned(src_bo, src->bitsPerPixel) ||
2754	    unaligned(dst_bo, dst->bitsPerPixel)) {
2755fallback_blt:
2756		DBG(("%s: fallback blt\n", __FUNCTION__));
2757		if (!sna_blt_compare_depth(src, dst))
2758			return false;
2759
2760		return sna_blt_copy_boxes_fallback(sna, alu,
2761						   src, src_bo, src_dx, src_dy,
2762						   dst, dst_bo, dst_dx, dst_dy,
2763						   box, n);
2764	}
2765
2766	if (overlaps(sna,
2767		     src_bo, src_dx, src_dy,
2768		     dst_bo, dst_dx, dst_dy,
2769		     box, n, flags,
2770		     &extents)) {
2771		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
2772
2773		if ((big || can_switch_to_blt(sna, dst_bo, flags)) &&
2774		    sna_blt_copy_boxes(sna, alu,
2775				       src_bo, src_dx, src_dy,
2776				       dst_bo, dst_dx, dst_dy,
2777				       dst->bitsPerPixel,
2778				       box, n))
2779			return true;
2780
2781		if (big)
2782			goto fallback_blt;
2783
2784		assert(src_bo == dst_bo);
2785		assert(src->depth == dst->depth);
2786		assert(src->width == dst->width);
2787		assert(src->height == dst->height);
2788		return sna_render_copy_boxes__overlap(sna, alu,
2789						      src, src_bo,
2790						      src_dx, src_dy,
2791						      dst_dx, dst_dy,
2792						      box, n, &extents);
2793	}
2794
2795	if (dst->depth == src->depth) {
2796		tmp.dst.format = sna_render_format_for_depth(dst->depth);
2797		tmp.src.pict_format = tmp.dst.format;
2798	} else {
2799		tmp.dst.format = sna_format_for_depth(dst->depth);
2800		tmp.src.pict_format = sna_format_for_depth(src->depth);
2801	}
2802	if (!gen8_check_format(tmp.src.pict_format))
2803		goto fallback_blt;
2804
2805	tmp.dst.pixmap = (PixmapPtr)dst;
2806	tmp.dst.width  = dst->width;
2807	tmp.dst.height = dst->height;
2808	tmp.dst.bo = dst_bo;
2809	tmp.dst.x = tmp.dst.y = 0;
2810	tmp.damage = NULL;
2811
2812	sna_render_composite_redirect_init(&tmp);
2813	if (too_large(tmp.dst.width, tmp.dst.height)) {
2814		int i;
2815
2816		extents = box[0];
2817		for (i = 1; i < n; i++) {
2818			if (box[i].x1 < extents.x1)
2819				extents.x1 = box[i].x1;
2820			if (box[i].y1 < extents.y1)
2821				extents.y1 = box[i].y1;
2822
2823			if (box[i].x2 > extents.x2)
2824				extents.x2 = box[i].x2;
2825			if (box[i].y2 > extents.y2)
2826				extents.y2 = box[i].y2;
2827		}
2828
2829		if (!sna_render_composite_redirect(sna, &tmp,
2830						   extents.x1 + dst_dx,
2831						   extents.y1 + dst_dy,
2832						   extents.x2 - extents.x1,
2833						   extents.y2 - extents.y1,
2834						   n > 1))
2835			goto fallback_tiled;
2836	}
2837
2838	tmp.src.card_format = gen8_get_card_format(tmp.src.pict_format);
2839	if (too_large(src->width, src->height)) {
2840		int i;
2841
2842		extents = box[0];
2843		for (i = 1; i < n; i++) {
2844			if (box[i].x1 < extents.x1)
2845				extents.x1 = box[i].x1;
2846			if (box[i].y1 < extents.y1)
2847				extents.y1 = box[i].y1;
2848
2849			if (box[i].x2 > extents.x2)
2850				extents.x2 = box[i].x2;
2851			if (box[i].y2 > extents.y2)
2852				extents.y2 = box[i].y2;
2853		}
2854
2855		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
2856					       extents.x1 + src_dx,
2857					       extents.y1 + src_dy,
2858					       extents.x2 - extents.x1,
2859					       extents.y2 - extents.y1))
2860			goto fallback_tiled_dst;
2861	} else {
2862		tmp.src.bo = src_bo;
2863		tmp.src.width  = src->width;
2864		tmp.src.height = src->height;
2865		tmp.src.offset[0] = tmp.src.offset[1] = 0;
2866	}
2867
2868	tmp.mask.bo = NULL;
2869
2870	tmp.floats_per_vertex = 2;
2871	tmp.floats_per_rect = 6;
2872	tmp.need_magic_ca_pass = 0;
2873
2874	tmp.u.gen8.flags = COPY_FLAGS(alu);
2875
2876	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
2877	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
2878		kgem_submit(&sna->kgem);
2879		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
2880			if (tmp.src.bo != src_bo)
2881				kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2882			if (tmp.redirect.real_bo)
2883				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2884			goto fallback_blt;
2885		}
2886		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
2887	}
2888
2889	src_dx += tmp.src.offset[0];
2890	src_dy += tmp.src.offset[1];
2891
2892	dst_dx += tmp.dst.x;
2893	dst_dy += tmp.dst.y;
2894
2895	tmp.dst.x = tmp.dst.y = 0;
2896
2897	gen8_align_vertex(sna, &tmp);
2898	gen8_emit_copy_state(sna, &tmp);
2899
2900	do {
2901		int16_t *v;
2902		int n_this_time;
2903
2904		n_this_time = gen8_get_rectangles(sna, &tmp, n,
2905						  gen8_emit_copy_state);
2906		n -= n_this_time;
2907
2908		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
2909		sna->render.vertex_used += 6 * n_this_time;
2910		assert(sna->render.vertex_used <= sna->render.vertex_size);
2911		do {
2912
2913			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
2914			     box->x1 + src_dx, box->y1 + src_dy,
2915			     box->x1 + dst_dx, box->y1 + dst_dy,
2916			     box->x2 - box->x1, box->y2 - box->y1));
2917			v[0] = box->x2 + dst_dx;
2918			v[2] = box->x2 + src_dx;
2919			v[1]  = v[5] = box->y2 + dst_dy;
2920			v[3]  = v[7] = box->y2 + src_dy;
2921			v[8]  = v[4] = box->x1 + dst_dx;
2922			v[10] = v[6] = box->x1 + src_dx;
2923			v[9]  = box->y1 + dst_dy;
2924			v[11] = box->y1 + src_dy;
2925			v += 12; box++;
2926		} while (--n_this_time);
2927	} while (n);
2928
2929	gen8_vertex_flush(sna);
2930	sna_render_composite_redirect_done(sna, &tmp);
2931	if (tmp.src.bo != src_bo)
2932		kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2933	return true;
2934
2935fallback_tiled_dst:
2936	if (tmp.redirect.real_bo)
2937		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2938fallback_tiled:
2939	DBG(("%s: fallback tiled\n", __FUNCTION__));
2940	if (sna_blt_compare_depth(src, dst) &&
2941	    sna_blt_copy_boxes(sna, alu,
2942			       src_bo, src_dx, src_dy,
2943			       dst_bo, dst_dx, dst_dy,
2944			       dst->bitsPerPixel,
2945			       box, n))
2946		return true;
2947
2948	return sna_tiling_copy_boxes(sna, alu,
2949				     src, src_bo, src_dx, src_dy,
2950				     dst, dst_bo, dst_dx, dst_dy,
2951				     box, n);
2952}
2953
2954static void
2955gen8_render_copy_blt(struct sna *sna,
2956		     const struct sna_copy_op *op,
2957		     int16_t sx, int16_t sy,
2958		     int16_t w,  int16_t h,
2959		     int16_t dx, int16_t dy)
2960{
2961	int16_t *v;
2962
2963	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_copy_state);
2964
2965	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
2966	sna->render.vertex_used += 6;
2967	assert(sna->render.vertex_used <= sna->render.vertex_size);
2968
2969	v[0]  = dx+w; v[1]  = dy+h;
2970	v[2]  = sx+w; v[3]  = sy+h;
2971	v[4]  = dx;   v[5]  = dy+h;
2972	v[6]  = sx;   v[7]  = sy+h;
2973	v[8]  = dx;   v[9]  = dy;
2974	v[10] = sx;   v[11] = sy;
2975}
2976
2977static void
2978gen8_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
2979{
2980	if (sna->render.vertex_offset)
2981		gen8_vertex_flush(sna);
2982}
2983
2984static bool
2985gen8_render_copy(struct sna *sna, uint8_t alu,
2986		 PixmapPtr src, struct kgem_bo *src_bo,
2987		 PixmapPtr dst, struct kgem_bo *dst_bo,
2988		 struct sna_copy_op *op)
2989{
2990	DBG(("%s (alu=%d, src=(%dx%d), dst=(%dx%d))\n",
2991	     __FUNCTION__, alu,
2992	     src->drawable.width, src->drawable.height,
2993	     dst->drawable.width, dst->drawable.height));
2994
2995	if (prefer_blt_copy(sna, src_bo, dst_bo, 0) &&
2996	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2997	    sna_blt_copy(sna, alu,
2998			 src_bo, dst_bo,
2999			 dst->drawable.bitsPerPixel,
3000			 op))
3001		return true;
3002
3003	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
3004	    too_large(src->drawable.width, src->drawable.height) ||
3005	    too_large(dst->drawable.width, dst->drawable.height) ||
3006	    unaligned(src_bo, src->drawable.bitsPerPixel) ||
3007	    unaligned(dst_bo, dst->drawable.bitsPerPixel)) {
3008fallback:
3009		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
3010			return false;
3011
3012		return sna_blt_copy(sna, alu, src_bo, dst_bo,
3013				    dst->drawable.bitsPerPixel,
3014				    op);
3015	}
3016
3017	if (dst->drawable.depth == src->drawable.depth) {
3018		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
3019		op->base.src.pict_format = op->base.dst.format;
3020	} else {
3021		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
3022		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
3023	}
3024	if (!gen8_check_format(op->base.src.pict_format))
3025		goto fallback;
3026
3027	op->base.dst.pixmap = dst;
3028	op->base.dst.width  = dst->drawable.width;
3029	op->base.dst.height = dst->drawable.height;
3030	op->base.dst.bo = dst_bo;
3031
3032	op->base.src.bo = src_bo;
3033	op->base.src.card_format =
3034		gen8_get_card_format(op->base.src.pict_format);
3035	op->base.src.width  = src->drawable.width;
3036	op->base.src.height = src->drawable.height;
3037
3038	op->base.mask.bo = NULL;
3039
3040	op->base.floats_per_vertex = 2;
3041	op->base.floats_per_rect = 6;
3042
3043	op->base.u.gen8.flags = COPY_FLAGS(alu);
3044
3045	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
3046	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
3047		kgem_submit(&sna->kgem);
3048		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
3049			goto fallback;
3050		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3051	}
3052
3053	gen8_align_vertex(sna, &op->base);
3054	gen8_emit_copy_state(sna, &op->base);
3055
3056	op->blt  = gen8_render_copy_blt;
3057	op->done = gen8_render_copy_done;
3058	return true;
3059}
3060
3061static void
3062gen8_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
3063{
3064	uint32_t *binding_table;
3065	uint16_t offset, dirty;
3066
3067	/* XXX Render Target Fast Clear
3068	 * Set RTFC Enable in PS and render a rectangle.
3069	 * Limited to a clearing the full MSC surface only with a
3070	 * specific kernel.
3071	 */
3072
3073	gen8_get_batch(sna, op);
3074
3075	binding_table = gen8_composite_get_binding_table(sna, &offset);
3076
3077	dirty = kgem_bo_is_dirty(op->dst.bo);
3078
3079	binding_table[0] =
3080		gen8_bind_bo(sna,
3081			     op->dst.bo, op->dst.width, op->dst.height,
3082			     gen8_get_dest_format(op->dst.format),
3083			     true);
3084	binding_table[1] =
3085		gen8_bind_bo(sna,
3086			     op->src.bo, 1, 1,
3087			     SURFACEFORMAT_B8G8R8A8_UNORM,
3088			     false);
3089
3090	if (sna->kgem.surface == offset &&
3091	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen8.surface_table) == *(uint64_t*)binding_table) {
3092		sna->kgem.surface += SURFACE_DW;
3093		offset = sna->render_state.gen8.surface_table;
3094	}
3095
3096	if (sna->kgem.batch[sna->render_state.gen8.surface_table] == binding_table[0])
3097		dirty = 0;
3098
3099	gen8_emit_state(sna, op, offset | dirty);
3100}
3101
3102static bool
3103gen8_render_fill_boxes(struct sna *sna,
3104		       CARD8 op,
3105		       PictFormat format,
3106		       const xRenderColor *color,
3107		       const DrawableRec *dst, struct kgem_bo *dst_bo,
3108		       const BoxRec *box, int n)
3109{
3110	struct sna_composite_op tmp;
3111	uint32_t pixel;
3112
3113	DBG(("%s (op=%d, color=(%04x, %04x, %04x, %04x) [%08x])\n",
3114	     __FUNCTION__, op,
3115	     color->red, color->green, color->blue, color->alpha, (int)format));
3116
3117	if (op >= ARRAY_SIZE(gen8_blend_op)) {
3118		DBG(("%s: fallback due to unhandled blend op: %d\n",
3119		     __FUNCTION__, op));
3120		return false;
3121	}
3122
3123	if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) ||
3124	    !gen8_check_dst_format(format) ||
3125	    unaligned(dst_bo, PICT_FORMAT_BPP(format))) {
3126		uint8_t alu = GXinvalid;
3127
3128		if (op <= PictOpSrc) {
3129			pixel = 0;
3130			if (op == PictOpClear)
3131				alu = GXclear;
3132			else if (sna_get_pixel_from_rgba(&pixel,
3133							 color->red,
3134							 color->green,
3135							 color->blue,
3136							 color->alpha,
3137							 format))
3138				alu = GXcopy;
3139		}
3140
3141		if (alu != GXinvalid &&
3142		    sna_blt_fill_boxes(sna, alu,
3143				       dst_bo, dst->bitsPerPixel,
3144				       pixel, box, n))
3145			return true;
3146
3147		if (!gen8_check_dst_format(format))
3148			return false;
3149	}
3150
3151	if (op == PictOpClear) {
3152		pixel = 0;
3153		op = PictOpSrc;
3154	} else if (!sna_get_pixel_from_rgba(&pixel,
3155					    color->red,
3156					    color->green,
3157					    color->blue,
3158					    color->alpha,
3159					    PICT_a8r8g8b8))
3160		return false;
3161
3162	DBG(("%s(%08x x %d [(%d, %d), (%d, %d) ...])\n",
3163	     __FUNCTION__, pixel, n,
3164	     box[0].x1, box[0].y1, box[0].x2, box[0].y2));
3165
3166	tmp.dst.pixmap = (PixmapPtr)dst;
3167	tmp.dst.width  = dst->width;
3168	tmp.dst.height = dst->height;
3169	tmp.dst.format = format;
3170	tmp.dst.bo = dst_bo;
3171	tmp.dst.x = tmp.dst.y = 0;
3172	tmp.damage = NULL;
3173
3174	sna_render_composite_redirect_init(&tmp);
3175	if (too_large(dst->width, dst->height)) {
3176		BoxRec extents;
3177
3178		boxes_extents(box, n, &extents);
3179		if (!sna_render_composite_redirect(sna, &tmp,
3180						   extents.x1, extents.y1,
3181						   extents.x2 - extents.x1,
3182						   extents.y2 - extents.y1,
3183						   n > 1))
3184			return sna_tiling_fill_boxes(sna, op, format, color,
3185						     dst, dst_bo, box, n);
3186	}
3187
3188	tmp.src.bo = sna_render_get_solid(sna, pixel);
3189	tmp.mask.bo = NULL;
3190
3191	tmp.floats_per_vertex = 2;
3192	tmp.floats_per_rect = 6;
3193	tmp.need_magic_ca_pass = false;
3194
3195	tmp.u.gen8.flags = FILL_FLAGS(op, format);
3196
3197	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
3198	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3199		kgem_submit(&sna->kgem);
3200		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3201			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3202			tmp.src.bo = NULL;
3203
3204			if (tmp.redirect.real_bo) {
3205				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
3206				tmp.redirect.real_bo = NULL;
3207			}
3208
3209			return false;
3210		}
3211		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3212	}
3213
3214	gen8_align_vertex(sna, &tmp);
3215	gen8_emit_fill_state(sna, &tmp);
3216
3217	do {
3218		int n_this_time;
3219		int16_t *v;
3220
3221		n_this_time = gen8_get_rectangles(sna, &tmp, n,
3222						  gen8_emit_fill_state);
3223		n -= n_this_time;
3224
3225		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
3226		sna->render.vertex_used += 6 * n_this_time;
3227		assert(sna->render.vertex_used <= sna->render.vertex_size);
3228		do {
3229			DBG(("	(%d, %d), (%d, %d)\n",
3230			     box->x1, box->y1, box->x2, box->y2));
3231
3232			v[0] = box->x2;
3233			v[5] = v[1] = box->y2;
3234			v[8] = v[4] = box->x1;
3235			v[9] = box->y1;
3236			v[2] = v[3]  = v[7]  = 1;
3237			v[6] = v[10] = v[11] = 0;
3238			v += 12; box++;
3239		} while (--n_this_time);
3240	} while (n);
3241
3242	gen8_vertex_flush(sna);
3243	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3244	sna_render_composite_redirect_done(sna, &tmp);
3245	return true;
3246}
3247
3248static void
3249gen8_render_fill_op_blt(struct sna *sna,
3250			const struct sna_fill_op *op,
3251			int16_t x, int16_t y, int16_t w, int16_t h)
3252{
3253	int16_t *v;
3254
3255	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
3256
3257	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_fill_state);
3258
3259	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3260	sna->render.vertex_used += 6;
3261	assert(sna->render.vertex_used <= sna->render.vertex_size);
3262
3263	v[0] = x+w;
3264	v[4] = v[8] = x;
3265	v[1] = v[5] = y+h;
3266	v[9] = y;
3267
3268	v[2] = v[3]  = v[7]  = 1;
3269	v[6] = v[10] = v[11] = 0;
3270}
3271
3272fastcall static void
3273gen8_render_fill_op_box(struct sna *sna,
3274			const struct sna_fill_op *op,
3275			const BoxRec *box)
3276{
3277	int16_t *v;
3278
3279	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
3280	     box->x1, box->y1, box->x2, box->y2));
3281
3282	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_fill_state);
3283
3284	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3285	sna->render.vertex_used += 6;
3286	assert(sna->render.vertex_used <= sna->render.vertex_size);
3287
3288	v[0] = box->x2;
3289	v[8] = v[4] = box->x1;
3290	v[5] = v[1] = box->y2;
3291	v[9] = box->y1;
3292
3293	v[7] = v[2]  = v[3]  = 1;
3294	v[6] = v[10] = v[11] = 0;
3295}
3296
3297fastcall static void
3298gen8_render_fill_op_boxes(struct sna *sna,
3299			  const struct sna_fill_op *op,
3300			  const BoxRec *box,
3301			  int nbox)
3302{
3303	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
3304	     box->x1, box->y1, box->x2, box->y2, nbox));
3305
3306	do {
3307		int nbox_this_time;
3308		int16_t *v;
3309
3310		nbox_this_time = gen8_get_rectangles(sna, &op->base, nbox,
3311						     gen8_emit_fill_state);
3312		nbox -= nbox_this_time;
3313
3314		v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3315		sna->render.vertex_used += 6 * nbox_this_time;
3316		assert(sna->render.vertex_used <= sna->render.vertex_size);
3317
3318		do {
3319			v[0] = box->x2;
3320			v[8] = v[4] = box->x1;
3321			v[5] = v[1] = box->y2;
3322			v[9] = box->y1;
3323			v[7] = v[2]  = v[3]  = 1;
3324			v[6] = v[10] = v[11] = 0;
3325			box++; v += 12;
3326		} while (--nbox_this_time);
3327	} while (nbox);
3328}
3329
3330static void
3331gen8_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
3332{
3333	if (sna->render.vertex_offset)
3334		gen8_vertex_flush(sna);
3335	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
3336}
3337
3338static bool
3339gen8_render_fill(struct sna *sna, uint8_t alu,
3340		 PixmapPtr dst, struct kgem_bo *dst_bo,
3341		 uint32_t color, unsigned flags,
3342		 struct sna_fill_op *op)
3343{
3344	DBG(("%s: (alu=%d, color=%x)\n", __FUNCTION__, alu, color));
3345
3346	if (prefer_blt_fill(sna, dst_bo, flags) &&
3347	    sna_blt_fill(sna, alu,
3348			 dst_bo, dst->drawable.bitsPerPixel,
3349			 color,
3350			 op))
3351		return true;
3352
3353	if (!(alu == GXcopy || alu == GXclear) ||
3354	    too_large(dst->drawable.width, dst->drawable.height) ||
3355	    unaligned(dst_bo, dst->drawable.bitsPerPixel))
3356		return sna_blt_fill(sna, alu,
3357				    dst_bo, dst->drawable.bitsPerPixel,
3358				    color,
3359				    op);
3360
3361	if (alu == GXclear)
3362		color = 0;
3363
3364	op->base.dst.pixmap = dst;
3365	op->base.dst.width  = dst->drawable.width;
3366	op->base.dst.height = dst->drawable.height;
3367	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
3368	op->base.dst.bo = dst_bo;
3369	op->base.dst.x = op->base.dst.y = 0;
3370
3371	op->base.src.bo =
3372		sna_render_get_solid(sna,
3373				     sna_rgba_for_color(color,
3374							dst->drawable.depth));
3375	op->base.mask.bo = NULL;
3376
3377	op->base.need_magic_ca_pass = false;
3378	op->base.floats_per_vertex = 2;
3379	op->base.floats_per_rect = 6;
3380
3381	op->base.u.gen8.flags = FILL_FLAGS_NOBLEND;
3382
3383	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
3384	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3385		kgem_submit(&sna->kgem);
3386		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3387			kgem_bo_destroy(&sna->kgem, op->base.src.bo);
3388			return false;
3389		}
3390
3391		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3392	}
3393
3394	gen8_align_vertex(sna, &op->base);
3395	gen8_emit_fill_state(sna, &op->base);
3396
3397	op->blt   = gen8_render_fill_op_blt;
3398	op->box   = gen8_render_fill_op_box;
3399	op->boxes = gen8_render_fill_op_boxes;
3400	op->points = NULL;
3401	op->done  = gen8_render_fill_op_done;
3402	return true;
3403}
3404
3405static bool
3406gen8_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3407			     uint32_t color,
3408			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
3409			     uint8_t alu)
3410{
3411	BoxRec box;
3412
3413	box.x1 = x1;
3414	box.y1 = y1;
3415	box.x2 = x2;
3416	box.y2 = y2;
3417
3418	return sna_blt_fill_boxes(sna, alu,
3419				  bo, dst->drawable.bitsPerPixel,
3420				  color, &box, 1);
3421}
3422
3423static bool
3424gen8_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3425		     uint32_t color,
3426		     int16_t x1, int16_t y1,
3427		     int16_t x2, int16_t y2,
3428		     uint8_t alu)
3429{
3430	struct sna_composite_op tmp;
3431	int16_t *v;
3432
3433	/* Prefer to use the BLT if already engaged */
3434	if (prefer_blt_fill(sna, bo, FILL_BOXES) &&
3435	    gen8_render_fill_one_try_blt(sna, dst, bo, color,
3436					 x1, y1, x2, y2, alu))
3437		return true;
3438
3439	/* Must use the BLT if we can't RENDER... */
3440	if (!(alu == GXcopy || alu == GXclear) ||
3441	    too_large(dst->drawable.width, dst->drawable.height) ||
3442	    unaligned(bo, dst->drawable.bitsPerPixel))
3443		return gen8_render_fill_one_try_blt(sna, dst, bo, color,
3444						    x1, y1, x2, y2, alu);
3445
3446	if (alu == GXclear)
3447		color = 0;
3448
3449	tmp.dst.pixmap = dst;
3450	tmp.dst.width  = dst->drawable.width;
3451	tmp.dst.height = dst->drawable.height;
3452	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
3453	tmp.dst.bo = bo;
3454	tmp.dst.x = tmp.dst.y = 0;
3455
3456	tmp.src.bo =
3457		sna_render_get_solid(sna,
3458				     sna_rgba_for_color(color,
3459							dst->drawable.depth));
3460	tmp.mask.bo = NULL;
3461
3462	tmp.floats_per_vertex = 2;
3463	tmp.floats_per_rect = 6;
3464	tmp.need_magic_ca_pass = false;
3465
3466	tmp.u.gen8.flags = FILL_FLAGS_NOBLEND;
3467
3468	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
3469	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3470		kgem_submit(&sna->kgem);
3471		if (kgem_check_bo(&sna->kgem, bo, NULL)) {
3472			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3473			return false;
3474		}
3475		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3476	}
3477
3478	gen8_align_vertex(sna, &tmp);
3479	gen8_emit_fill_state(sna, &tmp);
3480
3481	gen8_get_rectangles(sna, &tmp, 1, gen8_emit_fill_state);
3482
3483	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
3484
3485	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3486	sna->render.vertex_used += 6;
3487	assert(sna->render.vertex_used <= sna->render.vertex_size);
3488
3489	v[0] = x2;
3490	v[8] = v[4] = x1;
3491	v[5] = v[1] = y2;
3492	v[9] = y1;
3493	v[7] = v[2]  = v[3]  = 1;
3494	v[6] = v[10] = v[11] = 0;
3495
3496	gen8_vertex_flush(sna);
3497	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3498
3499	return true;
3500}
3501
3502static bool
3503gen8_render_clear_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
3504{
3505	BoxRec box;
3506
3507	box.x1 = 0;
3508	box.y1 = 0;
3509	box.x2 = dst->drawable.width;
3510	box.y2 = dst->drawable.height;
3511
3512	return sna_blt_fill_boxes(sna, GXclear,
3513				  bo, dst->drawable.bitsPerPixel,
3514				  0, &box, 1);
3515}
3516
3517static bool
3518gen8_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
3519{
3520	struct sna_composite_op tmp;
3521	int16_t *v;
3522
3523	DBG(("%s: %dx%d\n",
3524	     __FUNCTION__,
3525	     dst->drawable.width,
3526	     dst->drawable.height));
3527
3528	/* Prefer to use the BLT if already engaged */
3529	if (sna->kgem.mode == KGEM_BLT &&
3530	    gen8_render_clear_try_blt(sna, dst, bo))
3531		return true;
3532
3533	/* Must use the BLT if we can't RENDER... */
3534	if (too_large(dst->drawable.width, dst->drawable.height) ||
3535	    unaligned(bo, dst->drawable.bitsPerPixel))
3536		return gen8_render_clear_try_blt(sna, dst, bo);
3537
3538	tmp.dst.pixmap = dst;
3539	tmp.dst.width  = dst->drawable.width;
3540	tmp.dst.height = dst->drawable.height;
3541	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
3542	tmp.dst.bo = bo;
3543	tmp.dst.x = tmp.dst.y = 0;
3544
3545	tmp.src.bo = sna_render_get_solid(sna, 0);
3546	tmp.mask.bo = NULL;
3547
3548	tmp.floats_per_vertex = 2;
3549	tmp.floats_per_rect = 6;
3550	tmp.need_magic_ca_pass = false;
3551
3552	tmp.u.gen8.flags = FILL_FLAGS_NOBLEND;
3553
3554	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
3555	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3556		kgem_submit(&sna->kgem);
3557		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3558			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3559			return false;
3560		}
3561		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3562	}
3563
3564	gen8_align_vertex(sna, &tmp);
3565	gen8_emit_fill_state(sna, &tmp);
3566
3567	gen8_get_rectangles(sna, &tmp, 1, gen8_emit_fill_state);
3568
3569	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3570	sna->render.vertex_used += 6;
3571	assert(sna->render.vertex_used <= sna->render.vertex_size);
3572
3573	v[0] = dst->drawable.width;
3574	v[5] = v[1] = dst->drawable.height;
3575	v[8] = v[4] = 0;
3576	v[9] = 0;
3577
3578	v[7] = v[2]  = v[3]  = 1;
3579	v[6] = v[10] = v[11] = 0;
3580
3581	gen8_vertex_flush(sna);
3582	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3583
3584	return true;
3585}
3586
3587#if !NO_VIDEO
3588static uint32_t gen8_bind_video_source(struct sna *sna,
3589				       struct kgem_bo *bo,
3590				       uint32_t delta,
3591				       int width,
3592				       int height,
3593				       int pitch,
3594				       uint32_t format)
3595{
3596	uint32_t *ss;
3597	int offset;
3598
3599	offset = sna->kgem.surface -= SURFACE_DW;
3600	ss = sna->kgem.batch + offset;
3601	ss[0] = (SURFACE_2D << SURFACE_TYPE_SHIFT |
3602		 gen8_tiling_bits(bo->tiling) |
3603		 format << SURFACE_FORMAT_SHIFT |
3604		 SURFACE_VALIGN_4 | SURFACE_HALIGN_4);
3605	ss[1] = 0;
3606	ss[2] = ((width - 1)  << SURFACE_WIDTH_SHIFT |
3607		 (height - 1) << SURFACE_HEIGHT_SHIFT);
3608	ss[3] = (pitch - 1) << SURFACE_PITCH_SHIFT;
3609	ss[4] = 0;
3610	ss[5] = 0;
3611	ss[6] = 0;
3612	ss[7] = SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
3613	*(uint64_t *)(ss+8) =
3614		kgem_add_reloc64(&sna->kgem, offset + 8, bo,
3615				 I915_GEM_DOMAIN_SAMPLER << 16,
3616				 delta);
3617	ss[10] = 0;
3618	ss[11] = 0;
3619	ss[12] = 0;
3620	ss[13] = 0;
3621	ss[14] = 0;
3622	ss[15] = 0;
3623
3624	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> sampler\n",
3625	     offset, bo->handle, ss[1],
3626	     format, width, height, bo->pitch, bo->tiling));
3627
3628	return offset * sizeof(uint32_t);
3629}
3630
3631static void gen8_emit_video_state(struct sna *sna,
3632				  const struct sna_composite_op *op)
3633{
3634	struct sna_video_frame *frame = op->priv;
3635	uint32_t src_surf_format;
3636	uint32_t src_surf_base[6];
3637	int src_width[6];
3638	int src_height[6];
3639	int src_pitch[6];
3640	uint32_t *binding_table;
3641	uint16_t offset;
3642	int n_src, n;
3643
3644	/* XXX VeBox, bicubic */
3645
3646	gen8_get_batch(sna, op);
3647
3648	src_surf_base[0] = 0;
3649	src_surf_base[1] = 0;
3650	src_surf_base[2] = frame->VBufOffset;
3651	src_surf_base[3] = frame->VBufOffset;
3652	src_surf_base[4] = frame->UBufOffset;
3653	src_surf_base[5] = frame->UBufOffset;
3654
3655	if (is_planar_fourcc(frame->id)) {
3656		src_surf_format = SURFACEFORMAT_R8_UNORM;
3657		src_width[1]  = src_width[0]  = frame->width;
3658		src_height[1] = src_height[0] = frame->height;
3659		src_pitch[1]  = src_pitch[0]  = frame->pitch[1];
3660		src_width[4]  = src_width[5]  = src_width[2]  = src_width[3] =
3661			frame->width / 2;
3662		src_height[4] = src_height[5] = src_height[2] = src_height[3] =
3663			frame->height / 2;
3664		src_pitch[4]  = src_pitch[5]  = src_pitch[2]  = src_pitch[3] =
3665			frame->pitch[0];
3666		n_src = 6;
3667	} else {
3668		if (frame->id == FOURCC_UYVY)
3669			src_surf_format = SURFACEFORMAT_YCRCB_SWAPY;
3670		else
3671			src_surf_format = SURFACEFORMAT_YCRCB_NORMAL;
3672
3673		src_width[0]  = frame->width;
3674		src_height[0] = frame->height;
3675		src_pitch[0]  = frame->pitch[0];
3676		n_src = 1;
3677	}
3678
3679	binding_table = gen8_composite_get_binding_table(sna, &offset);
3680
3681	binding_table[0] =
3682		gen8_bind_bo(sna,
3683			     op->dst.bo, op->dst.width, op->dst.height,
3684			     gen8_get_dest_format(op->dst.format),
3685			     true);
3686	for (n = 0; n < n_src; n++) {
3687		binding_table[1+n] =
3688			gen8_bind_video_source(sna,
3689					       frame->bo,
3690					       src_surf_base[n],
3691					       src_width[n],
3692					       src_height[n],
3693					       src_pitch[n],
3694					       src_surf_format);
3695	}
3696
3697	gen8_emit_state(sna, op, offset);
3698}
3699
3700static bool
3701gen8_render_video(struct sna *sna,
3702		  struct sna_video *video,
3703		  struct sna_video_frame *frame,
3704		  RegionPtr dstRegion,
3705		  PixmapPtr pixmap)
3706{
3707	struct sna_composite_op tmp;
3708	struct sna_pixmap *priv = sna_pixmap(pixmap);
3709	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
3710	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
3711	int src_width = frame->src.x2 - frame->src.x1;
3712	int src_height = frame->src.y2 - frame->src.y1;
3713	float src_offset_x, src_offset_y;
3714	float src_scale_x, src_scale_y;
3715	int nbox, pix_xoff, pix_yoff;
3716	unsigned filter;
3717	const BoxRec *box;
3718
3719	DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
3720	     __FUNCTION__,
3721	     src_width, src_height, dst_width, dst_height,
3722	     region_num_rects(dstRegion),
3723	     REGION_EXTENTS(NULL, dstRegion)->x1,
3724	     REGION_EXTENTS(NULL, dstRegion)->y1,
3725	     REGION_EXTENTS(NULL, dstRegion)->x2,
3726	     REGION_EXTENTS(NULL, dstRegion)->y2));
3727
3728	assert(priv->gpu_bo);
3729	assert(!too_large(pixmap->drawable.width, pixmap->drawable.height));
3730	assert(!unaligned(priv->gpu_bo, pixmap->drawable.bitsPerPixel));
3731
3732	memset(&tmp, 0, sizeof(tmp));
3733
3734	tmp.dst.pixmap = pixmap;
3735	tmp.dst.width  = pixmap->drawable.width;
3736	tmp.dst.height = pixmap->drawable.height;
3737	tmp.dst.format = sna_render_format_for_depth(pixmap->drawable.depth);
3738	tmp.dst.bo = priv->gpu_bo;
3739
3740	tmp.src.bo = frame->bo;
3741	tmp.mask.bo = NULL;
3742
3743	tmp.floats_per_vertex = 3;
3744	tmp.floats_per_rect = 9;
3745
3746	if (src_width == dst_width && src_height == dst_height)
3747		filter = SAMPLER_FILTER_NEAREST;
3748	else
3749		filter = SAMPLER_FILTER_BILINEAR;
3750
3751	tmp.u.gen8.flags =
3752		GEN8_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
3753					      SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
3754			       NO_BLEND,
3755			       is_planar_fourcc(frame->id) ?
3756			       GEN8_WM_KERNEL_VIDEO_PLANAR :
3757			       GEN8_WM_KERNEL_VIDEO_PACKED,
3758			       2);
3759	tmp.priv = frame;
3760
3761	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
3762	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
3763		kgem_submit(&sna->kgem);
3764		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
3765			return false;
3766
3767		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3768	}
3769
3770	gen8_align_vertex(sna, &tmp);
3771	gen8_emit_video_state(sna, &tmp);
3772
3773	/* Set up the offset for translating from the given region (in screen
3774	 * coordinates) to the backing pixmap.
3775	 */
3776#ifdef COMPOSITE
3777	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
3778	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
3779#else
3780	pix_xoff = 0;
3781	pix_yoff = 0;
3782#endif
3783
3784	DBG(("%s: src=(%d, %d)x(%d, %d); frame=(%dx%d), dst=(%dx%d)\n",
3785	     __FUNCTION__,
3786	     frame->src.x1, frame->src.y1,
3787	     src_width, src_height,
3788	     dst_width, dst_height,
3789	     frame->width, frame->height));
3790
3791	src_scale_x = (float)src_width / dst_width / frame->width;
3792	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
3793
3794	src_scale_y = (float)src_height / dst_height / frame->height;
3795	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;
3796
3797	DBG(("%s: scale=(%f, %f), offset=(%f, %f)\n",
3798	     __FUNCTION__,
3799	     src_scale_x, src_scale_y,
3800	     src_offset_x, src_offset_y));
3801
3802	box = region_rects(dstRegion);
3803	nbox = region_num_rects(dstRegion);
3804	while (nbox--) {
3805		BoxRec r;
3806
3807		DBG(("%s: dst=(%d, %d), (%d, %d) + (%d, %d); src=(%f, %f), (%f, %f)\n",
3808		     __FUNCTION__,
3809		     box->x1, box->y1,
3810		     box->x2, box->y2,
3811		     pix_xoff, pix_yoff,
3812		     box->x1 * src_scale_x + src_offset_x,
3813		     box->y1 * src_scale_y + src_offset_y,
3814		     box->x2 * src_scale_x + src_offset_x,
3815		     box->y2 * src_scale_y + src_offset_y));
3816
3817		r.x1 = box->x1 + pix_xoff;
3818		r.x2 = box->x2 + pix_xoff;
3819		r.y1 = box->y1 + pix_yoff;
3820		r.y2 = box->y2 + pix_yoff;
3821
3822		gen8_get_rectangles(sna, &tmp, 1, gen8_emit_video_state);
3823
3824		OUT_VERTEX(r.x2, r.y2);
3825		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
3826		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
3827
3828		OUT_VERTEX(r.x1, r.y2);
3829		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
3830		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
3831
3832		OUT_VERTEX(r.x1, r.y1);
3833		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
3834		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
3835
3836		if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
3837			sna_damage_add_box(&priv->gpu_damage, &r);
3838			sna_damage_subtract_box(&priv->cpu_damage, &r);
3839		}
3840		box++;
3841	}
3842
3843	gen8_vertex_flush(sna);
3844	return true;
3845}
3846#endif
3847
3848static void gen8_render_flush(struct sna *sna)
3849{
3850	gen8_vertex_close(sna);
3851
3852	assert(sna->render.vb_id == 0);
3853	assert(sna->render.vertex_offset == 0);
3854}
3855
3856static void gen8_render_reset(struct sna *sna)
3857{
3858	sna->render_state.gen8.emit_flush = false;
3859	sna->render_state.gen8.needs_invariant = true;
3860	sna->render_state.gen8.ve_id = 3 << 2;
3861	sna->render_state.gen8.last_primitive = -1;
3862
3863	sna->render_state.gen8.num_sf_outputs = 0;
3864	sna->render_state.gen8.samplers = -1;
3865	sna->render_state.gen8.blend = -1;
3866	sna->render_state.gen8.kernel = -1;
3867	sna->render_state.gen8.drawrect_offset = -1;
3868	sna->render_state.gen8.drawrect_limit = -1;
3869	sna->render_state.gen8.surface_table = 0;
3870
3871	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
3872		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
3873		discard_vbo(sna);
3874	}
3875
3876	sna->render.vertex_offset = 0;
3877	sna->render.nvertex_reloc = 0;
3878	sna->render.vb_id = 0;
3879}
3880
3881static void gen8_render_fini(struct sna *sna)
3882{
3883	kgem_bo_destroy(&sna->kgem, sna->render_state.gen8.general_bo);
3884}
3885
3886static bool gen8_render_setup(struct sna *sna)
3887{
3888	struct gen8_render_state *state = &sna->render_state.gen8;
3889	struct sna_static_stream general;
3890	struct gen8_sampler_state *ss;
3891	int i, j, k, l, m;
3892	uint32_t devid;
3893
3894	devid = intel_get_device_id(sna->dev);
3895	if (devid & 0xf)
3896		state->gt = ((devid >> 4) & 0xf) + 1;
3897	DBG(("%s: gt=%d\n", __FUNCTION__, state->gt));
3898
3899	sna_static_stream_init(&general);
3900
3901	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
3902	 * dumps, you know it points to zero.
3903	 */
3904	null_create(&general);
3905
3906	for (m = 0; m < ARRAY_SIZE(wm_kernels); m++) {
3907		if (wm_kernels[m].size) {
3908			state->wm_kernel[m][1] =
3909				sna_static_stream_add(&general,
3910						      wm_kernels[m].data,
3911						      wm_kernels[m].size,
3912						      64);
3913		} else {
3914			if (USE_8_PIXEL_DISPATCH) {
3915				state->wm_kernel[m][0] =
3916					sna_static_stream_compile_wm(sna, &general,
3917								     wm_kernels[m].data, 8);
3918			}
3919
3920			if (USE_16_PIXEL_DISPATCH) {
3921				state->wm_kernel[m][1] =
3922					sna_static_stream_compile_wm(sna, &general,
3923								     wm_kernels[m].data, 16);
3924			}
3925
3926			if (USE_32_PIXEL_DISPATCH) {
3927				state->wm_kernel[m][2] =
3928					sna_static_stream_compile_wm(sna, &general,
3929								     wm_kernels[m].data, 32);
3930			}
3931		}
3932		assert(state->wm_kernel[m][0]|state->wm_kernel[m][1]|state->wm_kernel[m][2]);
3933	}
3934
3935	COMPILE_TIME_ASSERT(SAMPLER_OFFSET(FILTER_COUNT, EXTEND_COUNT, FILTER_COUNT, EXTEND_COUNT) <= 0x7ff);
3936	ss = sna_static_stream_map(&general,
3937				   2 * sizeof(*ss) *
3938				   (2 +
3939				    FILTER_COUNT * EXTEND_COUNT *
3940				    FILTER_COUNT * EXTEND_COUNT),
3941				   32);
3942	state->wm_state = sna_static_stream_offsetof(&general, ss);
3943	sampler_copy_init(ss); ss += 2;
3944	sampler_fill_init(ss); ss += 2;
3945	for (i = 0; i < FILTER_COUNT; i++) {
3946		for (j = 0; j < EXTEND_COUNT; j++) {
3947			for (k = 0; k < FILTER_COUNT; k++) {
3948				for (l = 0; l < EXTEND_COUNT; l++) {
3949					sampler_state_init(ss++, i, j);
3950					sampler_state_init(ss++, k, l);
3951				}
3952			}
3953		}
3954	}
3955
3956	state->cc_blend = gen8_create_blend_state(&general);
3957
3958	state->general_bo = sna_static_stream_fini(sna, &general);
3959	return state->general_bo != NULL;
3960}
3961
3962const char *gen8_render_init(struct sna *sna, const char *backend)
3963{
3964	if (!gen8_render_setup(sna))
3965		return backend;
3966
3967	sna->kgem.context_switch = gen6_render_context_switch;
3968	sna->kgem.retire = gen6_render_retire;
3969	sna->kgem.expire = gen4_render_expire;
3970
3971#if !NO_COMPOSITE
3972	sna->render.composite = gen8_render_composite;
3973	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
3974#endif
3975#if !NO_COMPOSITE_SPANS
3976	sna->render.check_composite_spans = gen8_check_composite_spans;
3977	sna->render.composite_spans = gen8_render_composite_spans;
3978	sna->render.prefer_gpu |= PREFER_GPU_SPANS;
3979#endif
3980#if !NO_VIDEO
3981	sna->render.video = gen8_render_video;
3982#endif
3983
3984#if !NO_COPY_BOXES
3985	sna->render.copy_boxes = gen8_render_copy_boxes;
3986#endif
3987#if !NO_COPY
3988	sna->render.copy = gen8_render_copy;
3989#endif
3990
3991#if !NO_FILL_BOXES
3992	sna->render.fill_boxes = gen8_render_fill_boxes;
3993#endif
3994#if !NO_FILL
3995	sna->render.fill = gen8_render_fill;
3996#endif
3997#if !NO_FILL_ONE
3998	sna->render.fill_one = gen8_render_fill_one;
3999#endif
4000#if !NO_FILL_CLEAR
4001	sna->render.clear = gen8_render_clear;
4002#endif
4003
4004	sna->render.flush = gen8_render_flush;
4005	sna->render.reset = gen8_render_reset;
4006	sna->render.fini = gen8_render_fini;
4007
4008	sna->render.max_3d_size = GEN8_MAX_SIZE;
4009	sna->render.max_3d_pitch = 1 << 18;
4010	return "Broadwell";
4011}
4012