/*
 * Copyright © 2012,2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "sna.h"
#include "sna_reg.h"
#include "sna_render.h"
#include "sna_render_inline.h"
#include "sna_video.h"

#include "gen8_render.h"
#include "gen8_eu.h"
#include "gen4_common.h"
#include "gen4_source.h"
#include "gen4_vertex.h"
#include "gen6_common.h"
#include "gen8_vertex.h"

#define SIM 1

#define ALWAYS_INVALIDATE 0
#define ALWAYS_FLUSH 0
#define ALWAYS_STALL 0

#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0
#define NO_COPY 0
#define NO_COPY_BOXES 0
#define NO_FILL 0
#define NO_FILL_BOXES 0
#define NO_FILL_ONE 0
#define NO_FILL_CLEAR 0
#define NO_VIDEO 0

#define USE_8_PIXEL_DISPATCH 1
#define USE_16_PIXEL_DISPATCH 1
#define USE_32_PIXEL_DISPATCH 0

#if !USE_8_PIXEL_DISPATCH && !USE_16_PIXEL_DISPATCH && !USE_32_PIXEL_DISPATCH
#error "Must enable at least one of the 8, 16 or 32 pixel dispatch modes"
#endif

#define GEN8_MAX_SIZE 16384

/* XXX Todo
 *
 * STR (software tiled rendering) mode. No, really.
 * 64x32 pixel blocks align with the rendering cache. Worth considering.
 */

#define is_aligned(x, y) (((x) & ((y) - 1)) == 0)

/* Pipeline stages:
 *  1. Command Streamer (CS)
 *  2. Vertex Fetch (VF)
 *  3. Vertex Shader (VS)
 *  4. Hull Shader (HS)
 *  5. Tessellation Engine (TE)
 *  6. Domain Shader (DS)
 *  7. Geometry Shader (GS)
 *  8. Stream Output Logic (SOL)
 *  9. Clipper (CLIP)
 * 10. Strip/Fan (SF)
 * 11. Windower/Masker (WM)
 * 12. Color Calculator (CC)
 */

#if !NO_VIDEO
static const uint32_t ps_kernel_packed_bt601[][4] = {
#include "exa_wm_src_affine.g8b"
#include "exa_wm_src_sample_argb.g8b"
#include "exa_wm_yuv_rgb_bt601.g8b"
#include "exa_wm_write.g8b"
};

static const uint32_t ps_kernel_planar_bt601[][4] = {
#include "exa_wm_src_affine.g8b"
#include "exa_wm_src_sample_planar.g8b"
#include "exa_wm_yuv_rgb_bt601.g8b"
#include "exa_wm_write.g8b"
};

static const uint32_t ps_kernel_nv12_bt601[][4] = {
#include "exa_wm_src_affine.g8b"
#include "exa_wm_src_sample_nv12.g8b"
#include "exa_wm_yuv_rgb_bt601.g8b"
#include "exa_wm_write.g8b"
};

static const uint32_t ps_kernel_packed_bt709[][4] = {
#include "exa_wm_src_affine.g8b"
#include "exa_wm_src_sample_argb.g8b"
#include "exa_wm_yuv_rgb_bt709.g8b"
#include "exa_wm_write.g8b"
};

static const uint32_t ps_kernel_planar_bt709[][4] = {
#include "exa_wm_src_affine.g8b"
#include "exa_wm_src_sample_planar.g8b"
#include "exa_wm_yuv_rgb_bt709.g8b"
#include "exa_wm_write.g8b"
};

static const uint32_t ps_kernel_nv12_bt709[][4] = {
#include "exa_wm_src_affine.g8b"
#include "exa_wm_src_sample_nv12.g8b"
#include "exa_wm_yuv_rgb_bt709.g8b"
#include "exa_wm_write.g8b"
};

static const uint32_t ps_kernel_rgb[][4] = {
#include "exa_wm_src_affine.g8b"
#include "exa_wm_src_sample_argb.g8b"
#include "exa_wm_write.g8b"
};
#endif

#define SURFACE_DW (64 / sizeof(uint32_t))

#define KERNEL(kernel_enum, kernel, num_surfaces) \
    [GEN8_WM_KERNEL_##kernel_enum] = {#kernel_enum, kernel, sizeof(kernel), num_surfaces}
#define NOKERNEL(kernel_enum, func, num_surfaces) \
    [GEN8_WM_KERNEL_##kernel_enum] = {#kernel_enum, (void *)func, 0, num_surfaces}
static const struct wm_kernel_info {
	const char *name;
	const void *data;
	unsigned int size;
	int num_surfaces;
} wm_kernels[GEN8_WM_KERNEL_COUNT] = {
	NOKERNEL(NOMASK, gen8_wm_kernel__affine, 2),
	NOKERNEL(NOMASK_P, gen8_wm_kernel__projective, 2),

	NOKERNEL(MASK, gen8_wm_kernel__affine_mask, 3),
	NOKERNEL(MASK_P, gen8_wm_kernel__projective_mask, 3),

	NOKERNEL(MASKCA, gen8_wm_kernel__affine_mask_ca, 3),
	NOKERNEL(MASKCA_P, gen8_wm_kernel__projective_mask_ca, 3),

	NOKERNEL(MASKSA, gen8_wm_kernel__affine_mask_sa, 3),
	NOKERNEL(MASKSA_P, gen8_wm_kernel__projective_mask_sa, 3),

	NOKERNEL(OPACITY, gen8_wm_kernel__affine_opacity, 2),
	NOKERNEL(OPACITY_P, gen8_wm_kernel__projective_opacity, 2),

#if !NO_VIDEO
	KERNEL(VIDEO_PLANAR_BT601, ps_kernel_planar_bt601, 7),
	KERNEL(VIDEO_NV12_BT601, ps_kernel_nv12_bt601, 7),
	KERNEL(VIDEO_PACKED_BT601, ps_kernel_packed_bt601, 2),
	KERNEL(VIDEO_PLANAR_BT709, ps_kernel_planar_bt709, 7),
	KERNEL(VIDEO_NV12_BT709, ps_kernel_nv12_bt709, 7),
	KERNEL(VIDEO_PACKED_BT709, ps_kernel_packed_bt709, 2),
	KERNEL(VIDEO_RGB, ps_kernel_rgb, 2),
#endif
};
#undef KERNEL

static const struct blendinfo {
	uint8_t src_alpha;
	uint8_t src_blend;
	uint8_t dst_blend;
} gen8_blend_op[] = {
	/* Clear */	{0, BLENDFACTOR_ZERO, BLENDFACTOR_ZERO},
	/* Src */	{0, BLENDFACTOR_ONE, BLENDFACTOR_ZERO},
	/* Dst */	{0, BLENDFACTOR_ZERO, BLENDFACTOR_ONE},
	/* Over */	{1, BLENDFACTOR_ONE, BLENDFACTOR_INV_SRC_ALPHA},
	/* OverReverse */ {0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ONE},
	/* In */	{0, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_ZERO},
	/* InReverse */	{1, BLENDFACTOR_ZERO, BLENDFACTOR_SRC_ALPHA},
	/* Out */	{0, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_ZERO},
	/* OutReverse */ {1, BLENDFACTOR_ZERO, BLENDFACTOR_INV_SRC_ALPHA},
	/* Atop */	{1, BLENDFACTOR_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
	/* AtopReverse */ {1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_SRC_ALPHA},
	/* Xor */	{1, BLENDFACTOR_INV_DST_ALPHA, BLENDFACTOR_INV_SRC_ALPHA},
	/* Add */	{0, BLENDFACTOR_ONE, BLENDFACTOR_ONE},
};

/**
 * Highest-valued BLENDFACTOR used in gen8_blend_op.
 *
 * This leaves out GEN8_BLENDFACTOR_INV_DST_COLOR,
 * GEN8_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
 * GEN8_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
 */
#define GEN8_BLENDFACTOR_COUNT (BLENDFACTOR_INV_DST_ALPHA + 1)

#define GEN8_BLEND_STATE_PADDED_SIZE	ALIGN(sizeof(struct gen8_blend_state), 64)

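/* BLEND_OFFSET() packs two facts about a (src, dst) factor pair into one
 * word: bit 15 records whether the destination is read (any dst factor
 * other than ZERO), and bits 4..14 hold the index of the corresponding
 * premade blend state. GEN8_READS_DST() and GEN8_BLEND() below unpack
 * the same fields.
 */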
#define BLEND_OFFSET(s, d) \
	(((d) != BLENDFACTOR_ZERO) << 15 | ((s) * GEN8_BLENDFACTOR_COUNT + (d)) << 4)

#define NO_BLEND BLEND_OFFSET(BLENDFACTOR_ONE, BLENDFACTOR_ZERO)
#define CLEAR BLEND_OFFSET(BLENDFACTOR_ZERO, BLENDFACTOR_ZERO)

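/* Linearise the (filter, extend) pairs for source and mask into a single
 * sampler-state index. The +2 skips over the two fixed entries reserved
 * for the copy (0) and fill (1) samplers defined below.
 */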
#define SAMPLER_OFFSET(sf, se, mf, me) \
	(((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) + 2)

#define VERTEX_2s2s 0

#define COPY_SAMPLER 0
#define COPY_VERTEX VERTEX_2s2s
#define COPY_FLAGS(a) GEN8_SET_FLAGS(COPY_SAMPLER, (a) == GXcopy ? NO_BLEND : CLEAR, GEN8_WM_KERNEL_NOMASK, COPY_VERTEX)

#define FILL_SAMPLER 1
#define FILL_VERTEX VERTEX_2s2s
#define FILL_FLAGS(op, format) GEN8_SET_FLAGS(FILL_SAMPLER, gen8_get_blend((op), false, (format)), GEN8_WM_KERNEL_NOMASK, FILL_VERTEX)
#define FILL_FLAGS_NOBLEND GEN8_SET_FLAGS(FILL_SAMPLER, NO_BLEND, GEN8_WM_KERNEL_NOMASK, FILL_VERTEX)

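/* Layout of the per-operation flags word assembled by GEN8_SET_FLAGS():
 * bits 20..31 sampler, bits 16..19 WM kernel, bit 15 reads-destination,
 * bits 4..14 blend-state offset, bits 0..3 vertex format id.
 */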
#define GEN8_SAMPLER(f) (((f) >> 20) & 0xfff)
#define GEN8_BLEND(f) (((f) >> 4) & 0x7ff)
#define GEN8_READS_DST(f) (((f) >> 15) & 1)
#define GEN8_KERNEL(f) (((f) >> 16) & 0xf)
#define GEN8_VERTEX(f) (((f) >> 0) & 0xf)
#define GEN8_SET_FLAGS(S, B, K, V)  ((S) << 20 | (K) << 16 | (B) | (V))

#define OUT_BATCH(v) batch_emit(sna, v)
#define OUT_BATCH64(v) batch_emit64(sna, v)
#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
#define OUT_VERTEX_F(v) vertex_emit(sna, v)

struct gt_info {
	const char *name;
	struct {
		int max_vs_entries;
	} urb;
};

static const struct gt_info bdw_gt_info = {
	.name = "Broadwell (gen8)",
	.urb = { .max_vs_entries = 960 },
};

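/* kgem.gen encodes the GPU generation in octal as major << 3 | minor:
 * 0100 is gen8.0 (Broadwell), 0101 is gen8.1 (Cherryview).
 */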
static bool is_bdw(struct sna *sna)
{
	return sna->kgem.gen == 0100;
}

static const struct gt_info chv_gt_info = {
	.name = "Cherryview (gen8)",
	.urb = { .max_vs_entries = 640 },
};

static bool is_chv(struct sna *sna)
{
	return sna->kgem.gen == 0101;
}

static inline bool too_large(int width, int height)
{
	return width > GEN8_MAX_SIZE || height > GEN8_MAX_SIZE;
}

static inline bool unaligned(struct kgem_bo *bo, int bpp)
{
	/* XXX What exactly do we need to meet H_ALIGN and V_ALIGN? */
#if 0
	int x, y;

	if (bo->proxy == NULL)
		return false;

	/* Assume that all tiled proxies are constructed correctly. */
	if (bo->tiling)
		return false;

	DBG(("%s: checking alignment of a linear proxy, offset=%d, pitch=%d, bpp=%d: => (%d, %d)\n",
	     __FUNCTION__, bo->delta, bo->pitch, bpp,
	     8 * (bo->delta % bo->pitch) / bpp, bo->delta / bo->pitch));

	/* This may be a random userptr map, check that it meets the
	 * render alignment of SURFACE_VALIGN_4 | SURFACE_HALIGN_4.
	 */
	y = bo->delta / bo->pitch;
	if (y & 3)
		return true;

	x = 8 * (bo->delta - y * bo->pitch);
	if (x & (4*bpp - 1))
		return true;

	return false;
#else
	return false;
#endif
}

static uint32_t gen8_get_blend(int op,
			       bool has_component_alpha,
			       uint32_t dst_format)
{
	uint32_t src, dst;

	COMPILE_TIME_ASSERT(BLENDFACTOR_INV_DST_ALPHA*GEN8_BLENDFACTOR_COUNT + BLENDFACTOR_INV_DST_ALPHA <= 0x7ff);

	src = gen8_blend_op[op].src_blend;
	dst = gen8_blend_op[op].dst_blend;

	/* If there's no dst alpha channel, adjust the blend op so that
	 * we'll treat it always as 1.
	 */
	if (PICT_FORMAT_A(dst_format) == 0) {
		if (src == BLENDFACTOR_DST_ALPHA)
			src = BLENDFACTOR_ONE;
		else if (src == BLENDFACTOR_INV_DST_ALPHA)
			src = BLENDFACTOR_ZERO;
	}

	/* If the source alpha is being used, then we should only be in a
	 * case where the source blend factor is 0, and the source blend
	 * value is the mask channels multiplied by the source picture's alpha.
	 */
	if (has_component_alpha && gen8_blend_op[op].src_alpha) {
		if (dst == BLENDFACTOR_SRC_ALPHA)
			dst = BLENDFACTOR_SRC_COLOR;
		else if (dst == BLENDFACTOR_INV_SRC_ALPHA)
			dst = BLENDFACTOR_INV_SRC_COLOR;
	}

	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
	     op, dst_format, PICT_FORMAT_A(dst_format),
	     src, dst, (int)(BLEND_OFFSET(src, dst)>>4)));
	assert(BLEND_OFFSET(src, dst) >> 4 <= 0xfff);
	return BLEND_OFFSET(src, dst);
}

static uint32_t gen8_get_card_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
		return SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_x8r8g8b8:
		return SURFACEFORMAT_B8G8R8X8_UNORM;
	case PICT_a8b8g8r8:
		return SURFACEFORMAT_R8G8B8A8_UNORM;
	case PICT_x8b8g8r8:
		return SURFACEFORMAT_R8G8B8X8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
		return SURFACEFORMAT_B10G10R10A2_UNORM;
	case PICT_x2r10g10b10:
		return SURFACEFORMAT_B10G10R10X2_UNORM;
#endif
	case PICT_r8g8b8:
		return SURFACEFORMAT_R8G8B8_UNORM;
	case PICT_r5g6b5:
		return SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_a1r5g5b5:
		return SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
		return SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

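/* Render targets always use the alpha-capable surface format; for the
 * x- formats the spare channel is undefined, so writing it is harmless.
 */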
static uint32_t gen8_get_dest_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
		return SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
		return SURFACEFORMAT_R8G8B8A8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
		return SURFACEFORMAT_B10G10R10A2_UNORM;
#endif
	case PICT_r5g6b5:
		return SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_x1r5g5b5:
	case PICT_a1r5g5b5:
		return SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
		return SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

static bool gen8_check_dst_format(PictFormat format)
{
	if (gen8_get_dest_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

static bool gen8_check_format(uint32_t format)
{
	if (gen8_get_card_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

static uint32_t gen8_filter(uint32_t filter)
{
	switch (filter) {
	default:
		assert(0);
	case PictFilterNearest:
		return SAMPLER_FILTER_NEAREST;
	case PictFilterBilinear:
		return SAMPLER_FILTER_BILINEAR;
	}
}

static bool gen8_check_filter(PicturePtr picture)
{
	switch (picture->filter) {
	case PictFilterNearest:
	case PictFilterBilinear:
		return true;
	default:
		return false;
	}
}

static uint32_t gen8_repeat(uint32_t repeat)
{
	switch (repeat) {
	default:
		assert(0);
	case RepeatNone:
		return SAMPLER_EXTEND_NONE;
	case RepeatNormal:
		return SAMPLER_EXTEND_REPEAT;
	case RepeatPad:
		return SAMPLER_EXTEND_PAD;
	case RepeatReflect:
		return SAMPLER_EXTEND_REFLECT;
	}
}

static bool gen8_check_repeat(PicturePtr picture)
{
	if (!picture->repeat)
		return true;

	switch (picture->repeatType) {
	case RepeatNone:
	case RepeatNormal:
	case RepeatPad:
	case RepeatReflect:
		return true;
	default:
		return false;
	}
}

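/* The WM kernels are enumerated in affine/projective pairs (see
 * wm_kernels[] above), so the projective variant of any kernel is
 * always base + 1.
 */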
static int
gen8_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
{
	int base;

	if (has_mask) {
		if (is_ca) {
			if (gen8_blend_op[op].src_alpha)
				base = GEN8_WM_KERNEL_MASKSA;
			else
				base = GEN8_WM_KERNEL_MASKCA;
		} else
			base = GEN8_WM_KERNEL_MASK;
	} else
		base = GEN8_WM_KERNEL_NOMASK;

	return base + !is_affine;
}

static void
gen8_emit_push_constants(struct sna *sna)
{
#if SIM
	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_VS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_HS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_DS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_GS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_PUSH_CONSTANT_ALLOC_PS | (2 - 2));
	OUT_BATCH(0);
#endif
}

static void
gen8_emit_urb(struct sna *sna)
{
	/* num of VS entries must be divisible by 8 if size < 9 */
	OUT_BATCH(GEN8_3DSTATE_URB_VS | (2 - 2));
	OUT_BATCH(sna->render_state.gen8.info->urb.max_vs_entries << URB_ENTRY_NUMBER_SHIFT |
		  (2 - 1) << URB_ENTRY_SIZE_SHIFT |
		  4 << URB_STARTING_ADDRESS_SHIFT);

	OUT_BATCH(GEN8_3DSTATE_URB_HS | (2 - 2));
	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
		  4 << URB_STARTING_ADDRESS_SHIFT);

	OUT_BATCH(GEN8_3DSTATE_URB_DS | (2 - 2));
	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
		  4 << URB_STARTING_ADDRESS_SHIFT);

	OUT_BATCH(GEN8_3DSTATE_URB_GS | (2 - 2));
	OUT_BATCH(0 << URB_ENTRY_SIZE_SHIFT |
		  4 << URB_STARTING_ADDRESS_SHIFT);
}

static void
gen8_emit_state_base_address(struct sna *sna)
{
	uint32_t num_pages;

	assert(sna->kgem.surface - sna->kgem.nbatch <= 16384);

	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));
	OUT_BATCH64(0); /* general */
	OUT_BATCH(0); /* stateless dataport */
	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* surface */
				     sna->kgem.nbatch,
				     NULL,
				     I915_GEM_DOMAIN_INSTRUCTION << 16,
				     BASE_ADDRESS_MODIFY));
	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* dynamic */
				     sna->kgem.nbatch,
				     sna->render_state.gen8.general_bo,
				     I915_GEM_DOMAIN_INSTRUCTION << 16,
				     BASE_ADDRESS_MODIFY));
	OUT_BATCH64(0); /* indirect */
	OUT_BATCH64(kgem_add_reloc64(&sna->kgem, /* instruction */
				     sna->kgem.nbatch,
				     sna->render_state.gen8.general_bo,
				     I915_GEM_DOMAIN_INSTRUCTION << 16,
				     BASE_ADDRESS_MODIFY));
	/* upper bounds */
	num_pages = sna->render_state.gen8.general_bo->size.pages.count;
	OUT_BATCH(0); /* general */
	OUT_BATCH(num_pages << 12 | 1); /* dynamic */
	OUT_BATCH(0); /* indirect */
	OUT_BATCH(num_pages << 12 | 1); /* instruction */
}

static void
gen8_emit_vs_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_VS | (9 - 2));
	OUT_BATCH64(0); /* no VS kernel */
	OUT_BATCH(0);
	OUT_BATCH64(0); /* scratch */
	OUT_BATCH(0);
	OUT_BATCH(1 << 1); /* pass-through */
	OUT_BATCH(1 << 16 | 1 << 21); /* urb write to SBE */

#if SIM
	OUT_BATCH(GEN8_3DSTATE_CONSTANT_VS | (11 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);

	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_VS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_VS | (2 - 2));
	OUT_BATCH(0);
#endif
}

static void
gen8_emit_hs_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_HS | (9 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH64(0); /* no HS kernel */
	OUT_BATCH64(0); /* scratch */
	OUT_BATCH(0);
	OUT_BATCH(0); /* pass-through */

#if SIM
	OUT_BATCH(GEN8_3DSTATE_CONSTANT_HS | (11 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);

#if 1
	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_HS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_HS | (2 - 2));
	OUT_BATCH(0);
#endif
#endif
}

static void
gen8_emit_te_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_TE | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen8_emit_ds_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_DS | (9 - 2));
	OUT_BATCH64(0); /* no kernel */
	OUT_BATCH(0);
	OUT_BATCH64(0); /* scratch */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

#if SIM
	OUT_BATCH(GEN8_3DSTATE_CONSTANT_DS | (11 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);

#if 1
	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_DS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_DS | (2 - 2));
	OUT_BATCH(0);
#endif
#endif
}

static void
gen8_emit_gs_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_GS | (10 - 2));
	OUT_BATCH64(0); /* no GS kernel */
	OUT_BATCH(0);
	OUT_BATCH64(0); /* scratch */
	OUT_BATCH(0);
	OUT_BATCH(0); /* pass-through */
	OUT_BATCH(0);
	OUT_BATCH(0);

#if SIM
	OUT_BATCH(GEN8_3DSTATE_CONSTANT_GS | (11 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);

#if 1
	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_GS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_GS | (2 - 2));
	OUT_BATCH(0);
#endif
#endif
}

static void
gen8_emit_sol_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_STREAMOUT | (5 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen8_emit_sf_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_SF | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen8_emit_clip_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_CLIP | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0); /* pass-through */
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_CC | (2 - 2));
	OUT_BATCH(0);
}

static void
gen8_emit_null_depth_buffer(struct sna *sna)
{
	OUT_BATCH(GEN8_3DSTATE_DEPTH_BUFFER | (8 - 2));
#if 0
	OUT_BATCH(SURFACE_NULL << DEPTH_BUFFER_TYPE_SHIFT |
		  DEPTHFORMAT_D32_FLOAT << DEPTH_BUFFER_FORMAT_SHIFT);
#else
	OUT_BATCH(SURFACE_2D << DEPTH_BUFFER_TYPE_SHIFT |
		  DEPTHFORMAT_D16_UNORM << DEPTH_BUFFER_FORMAT_SHIFT);
#endif
	OUT_BATCH64(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

#if SIM
	OUT_BATCH(GEN8_3DSTATE_HIER_DEPTH_BUFFER | (5 - 2));
	OUT_BATCH(0);
	OUT_BATCH64(0);
	OUT_BATCH(0);
#endif

#if SIM
	OUT_BATCH(GEN8_3DSTATE_STENCIL_BUFFER | (5 - 2));
	OUT_BATCH(0);
	OUT_BATCH64(0);
	OUT_BATCH(0);
#endif

#if SIM
	OUT_BATCH(GEN8_3DSTATE_WM_DEPTH_STENCIL | (3 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
#endif

#if SIM
	OUT_BATCH(GEN8_3DSTATE_CLEAR_PARAMS | (3 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
#endif
}

static void
gen8_emit_wm_invariant(struct sna *sna)
{
	gen8_emit_null_depth_buffer(sna);

#if SIM
	OUT_BATCH(GEN8_3DSTATE_SCISSOR_STATE_POINTERS | (2 - 2));
	OUT_BATCH(0);
#endif

	OUT_BATCH(GEN8_3DSTATE_WM | (2 - 2));
	//OUT_BATCH(WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC); /* XXX */
	OUT_BATCH(WM_PERSPECTIVE_PIXEL_BARYCENTRIC);

#if SIM
	OUT_BATCH(GEN8_3DSTATE_WM_CHROMAKEY | (2 - 2));
	OUT_BATCH(0);
#endif

#if 0
	OUT_BATCH(GEN8_3DSTATE_WM_HZ_OP | (5 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
#endif

	OUT_BATCH(GEN8_3DSTATE_PS_EXTRA | (2 - 2));
	OUT_BATCH(PSX_PIXEL_SHADER_VALID |
		  PSX_ATTRIBUTE_ENABLE);

	OUT_BATCH(GEN8_3DSTATE_RASTER | (5 - 2));
	OUT_BATCH(RASTER_FRONT_WINDING_CCW |
		  RASTER_CULL_NONE);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_SBE_SWIZ | (11 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

#if SIM
	OUT_BATCH(GEN8_3DSTATE_CONSTANT_PS | (11 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
#endif
}

static void
gen8_emit_cc_invariant(struct sna *sna)
{
}

static void
gen8_emit_vf_invariant(struct sna *sna)
{
	int n;

#if 1
	OUT_BATCH(GEN8_3DSTATE_VF | (2 - 2));
	OUT_BATCH(0);
#endif

	OUT_BATCH(GEN8_3DSTATE_VF_SGVS | (2 - 2));
	OUT_BATCH(0);

	OUT_BATCH(GEN8_3DSTATE_VF_TOPOLOGY | (2 - 2));
	OUT_BATCH(RECTLIST);

	OUT_BATCH(GEN8_3DSTATE_VF_STATISTICS | 0);

	for (n = 1; n <= 3; n++) {
		OUT_BATCH(GEN8_3DSTATE_VF_INSTANCING | (3 - 2));
		OUT_BATCH(n);
		OUT_BATCH(0);
	}
}

static void
gen8_emit_invariant(struct sna *sna)
{
	OUT_BATCH(GEN8_PIPELINE_SELECT | PIPELINE_SELECT_3D);

#if SIM
	OUT_BATCH(GEN8_STATE_SIP | (3 - 2));
	OUT_BATCH64(0);
#endif

	OUT_BATCH(GEN8_3DSTATE_MULTISAMPLE | (2 - 2));
	OUT_BATCH(MULTISAMPLE_PIXEL_LOCATION_CENTER |
		  MULTISAMPLE_NUMSAMPLES_1); /* 1 sample/pixel */

	OUT_BATCH(GEN8_3DSTATE_SAMPLE_MASK | (2 - 2));
	OUT_BATCH(1);

#if SIM
	OUT_BATCH(GEN8_3DSTATE_SAMPLE_PATTERN | (5 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
	//OUT_BATCH(8<<20 | 8<<16);
	OUT_BATCH(0);
#endif

	gen8_emit_push_constants(sna);
	gen8_emit_urb(sna);

	gen8_emit_state_base_address(sna);

	gen8_emit_vf_invariant(sna);
	gen8_emit_vs_invariant(sna);
	gen8_emit_hs_invariant(sna);
	gen8_emit_te_invariant(sna);
	gen8_emit_ds_invariant(sna);
	gen8_emit_gs_invariant(sna);
	gen8_emit_sol_invariant(sna);
	gen8_emit_clip_invariant(sna);
	gen8_emit_sf_invariant(sna);
	gen8_emit_wm_invariant(sna);
	gen8_emit_cc_invariant(sna);

	sna->render_state.gen8.needs_invariant = false;
}

static void
gen8_emit_cc(struct sna *sna, uint32_t blend)
{
	struct gen8_render_state *render = &sna->render_state.gen8;

	if (render->blend == blend)
		return;

	DBG(("%s: blend=%x (current=%x), src=%d, dst=%d\n",
	     __FUNCTION__, blend, render->blend,
	     blend / GEN8_BLENDFACTOR_COUNT,
	     blend % GEN8_BLENDFACTOR_COUNT));

	assert(blend < GEN8_BLENDFACTOR_COUNT * GEN8_BLENDFACTOR_COUNT);
	assert(blend / GEN8_BLENDFACTOR_COUNT > 0);
	assert(blend % GEN8_BLENDFACTOR_COUNT > 0);

	/* XXX can have up to 8 blend states preload, selectable via
	 * Render Target Index. What other side-effects of Render Target Index?
	 */

	OUT_BATCH(GEN8_3DSTATE_PS_BLEND | (2 - 2));
	if (blend != GEN8_BLEND(NO_BLEND)) {
		uint32_t src = blend / GEN8_BLENDFACTOR_COUNT;
		uint32_t dst = blend % GEN8_BLENDFACTOR_COUNT;
		OUT_BATCH(PS_BLEND_HAS_WRITEABLE_RT |
			  PS_BLEND_COLOR_BLEND_ENABLE |
			  src << PS_BLEND_SRC_ALPHA_SHIFT |
			  dst << PS_BLEND_DST_ALPHA_SHIFT |
			  src << PS_BLEND_SRC_SHIFT |
			  dst << PS_BLEND_DST_SHIFT);
	} else
		OUT_BATCH(PS_BLEND_HAS_WRITEABLE_RT);

	assert(is_aligned(render->cc_blend + blend * GEN8_BLEND_STATE_PADDED_SIZE, 64));
	OUT_BATCH(GEN8_3DSTATE_BLEND_STATE_POINTERS | (2 - 2));
	OUT_BATCH((render->cc_blend + blend * GEN8_BLEND_STATE_PADDED_SIZE) | 1);

	/* Force a CC_STATE pointer change to improve blend performance */
	OUT_BATCH(GEN8_3DSTATE_CC_STATE_POINTERS | (2 - 2));
	OUT_BATCH(0);

	render->blend = blend;
}

static void
gen8_emit_sampler(struct sna *sna, uint32_t state)
{
	if (sna->render_state.gen8.samplers == state)
		return;

	sna->render_state.gen8.samplers = state;

	DBG(("%s: sampler = %x\n", __FUNCTION__, state));

	assert(2 * sizeof(struct gen8_sampler_state) == 32);
	OUT_BATCH(GEN8_3DSTATE_SAMPLER_STATE_POINTERS_PS | (2 - 2));
	OUT_BATCH(sna->render_state.gen8.wm_state + state * 2 * sizeof(struct gen8_sampler_state));
}

static void
gen8_emit_sf(struct sna *sna, bool has_mask)
{
	int num_sf_outputs = has_mask ? 2 : 1;

	if (sna->render_state.gen8.num_sf_outputs == num_sf_outputs)
		return;

	DBG(("%s: num_sf_outputs=%d\n", __FUNCTION__, num_sf_outputs));

	sna->render_state.gen8.num_sf_outputs = num_sf_outputs;

	OUT_BATCH(GEN8_3DSTATE_SBE | (4 - 2));
	OUT_BATCH(num_sf_outputs << SBE_NUM_OUTPUTS_SHIFT |
		  SBE_FORCE_VERTEX_URB_READ_LENGTH | /* forced is faster */
		  SBE_FORCE_VERTEX_URB_READ_OFFSET |
		  1 << SBE_URB_ENTRY_READ_LENGTH_SHIFT |
		  1 << SBE_URB_ENTRY_READ_OFFSET_SHIFT);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen8_emit_wm(struct sna *sna, int kernel)
{
	const uint32_t *kernels;

	assert(kernel < ARRAY_SIZE(wm_kernels));
	if (sna->render_state.gen8.kernel == kernel)
		return;

	sna->render_state.gen8.kernel = kernel;
	kernels = sna->render_state.gen8.wm_kernel[kernel];

	DBG(("%s: switching to %s, num_surfaces=%d (8-wide? %d, 16-wide? %d, 32-wide? %d)\n",
	     __FUNCTION__,
	     wm_kernels[kernel].name,
	     wm_kernels[kernel].num_surfaces,
	     kernels[0], kernels[1], kernels[2]));
	assert(is_aligned(kernels[0], 64));
	assert(is_aligned(kernels[1], 64));
	assert(is_aligned(kernels[2], 64));

	OUT_BATCH(GEN8_3DSTATE_PS | (12 - 2));
	OUT_BATCH64(kernels[0] ?: kernels[1] ?: kernels[2]);
	OUT_BATCH(1 << PS_SAMPLER_COUNT_SHIFT |
		  PS_VECTOR_MASK_ENABLE |
		  wm_kernels[kernel].num_surfaces << PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);
	OUT_BATCH64(0); /* scratch address */
	OUT_BATCH(PS_MAX_THREADS |
		  (kernels[0] ? PS_8_DISPATCH_ENABLE : 0) |
		  (kernels[1] ? PS_16_DISPATCH_ENABLE : 0) |
		  (kernels[2] ? PS_32_DISPATCH_ENABLE : 0));
	OUT_BATCH((kernels[0] ? 4 : kernels[1] ? 6 : 8) << PS_DISPATCH_START_GRF_SHIFT_0 |
		  8 << PS_DISPATCH_START_GRF_SHIFT_1 |
		  6 << PS_DISPATCH_START_GRF_SHIFT_2);
	OUT_BATCH64(kernels[2]);
	OUT_BATCH64(kernels[1]);
}

static bool
gen8_emit_binding_table(struct sna *sna, uint16_t offset)
{
	if (sna->render_state.gen8.surface_table == offset)
		return false;

	/* Binding table pointers */
	assert(is_aligned(4*offset, 32));
	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POINTERS_PS | (2 - 2));
	OUT_BATCH(offset*4);

	sna->render_state.gen8.surface_table = offset;
	return true;
}

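/* Returns true if the drawing rectangle is unchanged (so the caller may
 * still need to stall); emitting a fresh 3DSTATE_DRAWING_RECTANGLE
 * returns false, as it makes an explicit stall unnecessary.
 */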
static bool
gen8_emit_drawing_rectangle(struct sna *sna,
			    const struct sna_composite_op *op)
{
	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;

	assert(!too_large(abs(op->dst.x), abs(op->dst.y)));
	assert(!too_large(op->dst.width, op->dst.height));

	if (sna->render_state.gen8.drawrect_limit == limit &&
	    sna->render_state.gen8.drawrect_offset == offset)
		return true;

	sna->render_state.gen8.drawrect_offset = offset;
	sna->render_state.gen8.drawrect_limit = limit;

	OUT_BATCH(GEN8_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(limit);
	OUT_BATCH(offset);
	return false;
}

static void
gen8_emit_vertex_elements(struct sna *sna,
			  const struct sna_composite_op *op)
{
	/*
	 * vertex data in vertex buffer
	 *    position: (x, y)
	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
	 *    texture coordinate 1 if (has_mask is true): same as above
	 */
	struct gen8_render_state *render = &sna->render_state.gen8;
	uint32_t src_format, dw;
	int id = GEN8_VERTEX(op->u.gen8.flags);
	bool has_mask;

	DBG(("%s: setup id=%d\n", __FUNCTION__, id));

	if (render->ve_id == id)
		return;
	render->ve_id = id;

	/* The VUE layout
	 *    dword 0-3: pad (0.0, 0.0, 0.0, 0.0)
	 *    dword 4-7: position (x, y, 1.0, 1.0),
	 *    dword 8-11: texture coordinate 0 (u0, v0, w0, 1.0)
	 *    dword 12-15: texture coordinate 1 (u1, v1, w1, 1.0)
	 *
	 * dword 4-15 are fetched from vertex buffer
	 */
	has_mask = (id >> 2) != 0;
	OUT_BATCH(GEN8_3DSTATE_VERTEX_ELEMENTS |
		((2 * (3 + has_mask)) + 1 - 2));

	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
		  SURFACEFORMAT_R32G32B32A32_FLOAT << VE_FORMAT_SHIFT |
		  0 << VE_OFFSET_SHIFT);
	OUT_BATCH(COMPONENT_STORE_0 << VE_COMPONENT_0_SHIFT |
		  COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT |
		  COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT |
		  COMPONENT_STORE_0 << VE_COMPONENT_3_SHIFT);

	/* x,y */
	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
		  SURFACEFORMAT_R16G16_SSCALED << VE_FORMAT_SHIFT |
		  0 << VE_OFFSET_SHIFT);
	OUT_BATCH(COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT |
		  COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT |
		  COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT |
		  COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT);

	/* u0, v0, w0 */
	DBG(("%s: first channel %d floats, offset=4\n", __FUNCTION__, id & 3));
	dw = COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT;
	switch (id & 3) {
	default:
		assert(0);
	case 0:
		src_format = SURFACEFORMAT_R16G16_SSCALED;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
		break;
	case 1:
		src_format = SURFACEFORMAT_R32_FLOAT;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
		dw |= COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT;
		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
		break;
	case 2:
		src_format = SURFACEFORMAT_R32G32_FLOAT;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
		dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
		break;
	case 3:
		src_format = SURFACEFORMAT_R32G32B32_FLOAT;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
		dw |= COMPONENT_STORE_SRC << VE_COMPONENT_2_SHIFT;
		break;
	}
	OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
		  src_format << VE_FORMAT_SHIFT |
		  4 << VE_OFFSET_SHIFT);
	OUT_BATCH(dw);

	/* u1, v1, w1 */
	if (has_mask) {
		unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
		DBG(("%s: second channel %d floats, offset=%d\n", __FUNCTION__, (id >> 2) & 3, offset));
		dw = COMPONENT_STORE_1_FLT << VE_COMPONENT_3_SHIFT;
		switch (id >> 2) {
		case 1:
			src_format = SURFACEFORMAT_R32_FLOAT;
			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
			dw |= COMPONENT_STORE_0 << VE_COMPONENT_1_SHIFT;
			dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
			break;
		default:
			assert(0);
		case 2:
			src_format = SURFACEFORMAT_R32G32_FLOAT;
			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
			dw |= COMPONENT_STORE_0 << VE_COMPONENT_2_SHIFT;
			break;
		case 3:
			src_format = SURFACEFORMAT_R32G32B32_FLOAT;
			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_0_SHIFT;
			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_1_SHIFT;
			dw |= COMPONENT_STORE_SRC << VE_COMPONENT_2_SHIFT;
			break;
		}
		OUT_BATCH(id << VE_INDEX_SHIFT | VE_VALID |
			  src_format << VE_FORMAT_SHIFT |
			  offset << VE_OFFSET_SHIFT);
		OUT_BATCH(dw);
	}
}

inline static void
gen8_emit_pipe_invalidate(struct sna *sna)
{
	OUT_BATCH(GEN8_PIPE_CONTROL | (6 - 2));
	OUT_BATCH(PIPE_CONTROL_WC_FLUSH |
		  PIPE_CONTROL_TC_FLUSH |
		  PIPE_CONTROL_CS_STALL);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
}

inline static void
gen8_emit_pipe_flush(struct sna *sna, bool need_stall)
{
	unsigned stall;

	stall = 0;
	if (need_stall)
		stall = (PIPE_CONTROL_CS_STALL |
			 PIPE_CONTROL_STALL_AT_SCOREBOARD);

	OUT_BATCH(GEN8_PIPE_CONTROL | (6 - 2));
	OUT_BATCH(PIPE_CONTROL_WC_FLUSH | stall);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
}

inline static void
gen8_emit_pipe_stall(struct sna *sna)
{
	OUT_BATCH(GEN8_PIPE_CONTROL | (6 - 2));
	OUT_BATCH(PIPE_CONTROL_CS_STALL |
		  PIPE_CONTROL_FLUSH |
		  PIPE_CONTROL_STALL_AT_SCOREBOARD);
	OUT_BATCH64(0);
	OUT_BATCH64(0);
}

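/* The low bit of wm_binding_table carries a destination-dirty flag from
 * the caller (see gen8_emit_composite_state()): if the new render target
 * has already been written within this batch, the render cache must be
 * flushed before it is bound again.
 */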
static void
gen8_emit_state(struct sna *sna,
		const struct sna_composite_op *op,
		uint16_t wm_binding_table)
{
	bool need_invalidate;
	bool need_flush;
	bool need_stall;

	assert(op->dst.bo->exec);

	need_flush = wm_binding_table & 1 ||
		(sna->render_state.gen8.emit_flush && GEN8_READS_DST(op->u.gen8.flags));
	if (ALWAYS_FLUSH)
		need_flush = true;

	wm_binding_table &= ~1;

	need_stall = sna->render_state.gen8.surface_table != wm_binding_table;

	need_invalidate = kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo);
	if (ALWAYS_INVALIDATE)
		need_invalidate = true;

	need_stall &= gen8_emit_drawing_rectangle(sna, op);
	if (ALWAYS_STALL)
		need_stall = true;

	if (need_invalidate) {
		gen8_emit_pipe_invalidate(sna);
		kgem_clear_dirty(&sna->kgem);
		assert(op->dst.bo->exec);
		kgem_bo_mark_dirty(op->dst.bo);

		need_flush = false;
		need_stall = false;
	}
	if (need_flush) {
		gen8_emit_pipe_flush(sna, need_stall);
		need_stall = false;
	}
	if (need_stall)
		gen8_emit_pipe_stall(sna);

	gen8_emit_cc(sna, GEN8_BLEND(op->u.gen8.flags));
	gen8_emit_sampler(sna, GEN8_SAMPLER(op->u.gen8.flags));
	gen8_emit_sf(sna, GEN8_VERTEX(op->u.gen8.flags) >> 2);
	gen8_emit_wm(sna, GEN8_KERNEL(op->u.gen8.flags));
	gen8_emit_vertex_elements(sna, op);
	gen8_emit_binding_table(sna, wm_binding_table);

	sna->render_state.gen8.emit_flush = GEN8_READS_DST(op->u.gen8.flags);
}

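/* Component-alpha Over cannot be expressed as a single hardware blend,
 * so it is split in two: the first pass knocks out the destination using
 * the mask, and this second pass replays the same vertices with an Add
 * blend state and the CA kernel to accumulate the source.
 */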
static bool gen8_magic_ca_pass(struct sna *sna,
			       const struct sna_composite_op *op)
{
	struct gen8_render_state *state = &sna->render_state.gen8;

	if (!op->need_magic_ca_pass)
		return false;

	DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
	     sna->render.vertex_start, sna->render.vertex_index));

	gen8_emit_pipe_stall(sna);

	gen8_emit_cc(sna,
		     GEN8_BLEND(gen8_get_blend(PictOpAdd, true,
					       op->dst.format)));
	gen8_emit_wm(sna,
		     gen8_choose_composite_kernel(PictOpAdd,
						  true, true,
						  op->is_affine));

	OUT_BATCH(GEN8_3DPRIMITIVE | (7 - 2));
	OUT_BATCH(0); /* ignored, see VF_TOPOLOGY */
	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
	OUT_BATCH(sna->render.vertex_start);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */

	state->last_primitive = sna->kgem.nbatch;
	return true;
}

static void null_create(struct sna_static_stream *stream)
{
	/* A bunch of zeros useful for legacy border color and depth-stencil */
	sna_static_stream_map(stream, 64, 64);
}

static void
sampler_state_init(struct gen8_sampler_state *sampler_state,
		   sampler_filter_t filter,
		   sampler_extend_t extend)
{
	COMPILE_TIME_ASSERT(sizeof(*sampler_state) == 4*sizeof(uint32_t));

	sampler_state->ss0.lod_preclamp = 2;	/* GL mode */
	sampler_state->ss0.default_color_mode = 1;

	switch (filter) {
	default:
	case SAMPLER_FILTER_NEAREST:
		sampler_state->ss0.min_filter = MAPFILTER_NEAREST;
		sampler_state->ss0.mag_filter = MAPFILTER_NEAREST;
		break;
	case SAMPLER_FILTER_BILINEAR:
		sampler_state->ss0.min_filter = MAPFILTER_LINEAR;
		sampler_state->ss0.mag_filter = MAPFILTER_LINEAR;
		break;
	}

	/* XXX bicubic filter using MAPFILTER_FLEXIBLE */

	switch (extend) {
	default:
	case SAMPLER_EXTEND_NONE:
		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_CLAMP_BORDER;
		break;
	case SAMPLER_EXTEND_REPEAT:
		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_WRAP;
		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_WRAP;
		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_WRAP;
		break;
	case SAMPLER_EXTEND_PAD:
		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_CLAMP;
		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_CLAMP;
		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_CLAMP;
		break;
	case SAMPLER_EXTEND_REFLECT:
		sampler_state->ss3.r_wrap_mode = TEXCOORDMODE_MIRROR;
		sampler_state->ss3.s_wrap_mode = TEXCOORDMODE_MIRROR;
		sampler_state->ss3.t_wrap_mode = TEXCOORDMODE_MIRROR;
		break;
	}
}

static void
sampler_copy_init(struct gen8_sampler_state *ss)
{
	sampler_state_init(ss, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
	ss->ss3.non_normalized_coord = 1;

	sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
}

static void
sampler_fill_init(struct gen8_sampler_state *ss)
{
	sampler_state_init(ss, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_REPEAT);
	ss->ss3.non_normalized_coord = 1;

	sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
}

static uint32_t
gen8_tiling_bits(uint32_t tiling)
{
	switch (tiling) {
	default: assert(0);
	case I915_TILING_NONE: return 0;
	case I915_TILING_X: return SURFACE_TILED;
	case I915_TILING_Y: return SURFACE_TILED | SURFACE_TILED_Y;
	}
}

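/* Memory Object Control State for surfaces: scanout buffers are bound
 * write-through so that the display engine, which does not snoop the
 * LLC, sees coherent data; everything else is write-back in all caches.
 */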
#define MOCS_WT (2 << 5)
#define MOCS_WB (3 << 5)
#define MOCS_eLLC_ONLY (0 << 3)
#define MOCS_LLC_ONLY (1 << 3)
#define MOCS_eLLC_LLC (2 << 3)
#define MOCS_ALL_CACHES (3 << 3)

/**
 * Sets up the common fields for a surface state buffer for the given
 * picture in the given surface state buffer.
 */
static uint32_t
gen8_bind_bo(struct sna *sna,
	     struct kgem_bo *bo,
	     uint32_t width,
	     uint32_t height,
	     uint32_t format,
	     bool is_dst)
{
	uint32_t *ss;
	uint32_t domains;
	int offset;
	uint32_t is_scanout = is_dst && bo->scanout;

	/* After the first bind, we manage the cache domains within the batch */
	offset = kgem_bo_get_binding(bo, format | is_dst << 30 | is_scanout << 31);
	if (offset) {
		if (is_dst)
			kgem_bo_mark_dirty(bo);
		assert(offset >= sna->kgem.surface);
		return offset * sizeof(uint32_t);
	}

	offset = sna->kgem.surface -= SURFACE_DW;
	ss = sna->kgem.batch + offset;
	ss[0] = (SURFACE_2D << SURFACE_TYPE_SHIFT |
		 gen8_tiling_bits(bo->tiling) |
		 format << SURFACE_FORMAT_SHIFT |
		 SURFACE_VALIGN_4 | SURFACE_HALIGN_4);
	if (is_dst) {
		ss[0] |= SURFACE_RC_READ_WRITE;
		domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER;
	} else
		domains = I915_GEM_DOMAIN_SAMPLER << 16;
	ss[1] = (is_dst && is_uncached(sna, bo)) ? 0 :
		is_scanout ? (MOCS_WT | MOCS_ALL_CACHES) << 24 :
			     (MOCS_WB | MOCS_ALL_CACHES) << 24;
	ss[2] = ((width - 1)  << SURFACE_WIDTH_SHIFT |
		 (height - 1) << SURFACE_HEIGHT_SHIFT);
	ss[3] = (bo->pitch - 1) << SURFACE_PITCH_SHIFT;
	ss[4] = 0;
	ss[5] = 0;
	ss[6] = 0;
	ss[7] = SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
	*(uint64_t *)(ss+8) = kgem_add_reloc64(&sna->kgem, offset + 8, bo, domains, 0);
	ss[10] = 0;
	ss[11] = 0;
	ss[12] = 0;
	ss[13] = 0;
	ss[14] = 0;
	ss[15] = 0;

	kgem_bo_set_binding(bo, format | is_dst << 30 | is_scanout << 31, offset);

	DBG(("[%x] bind bo(handle=%d, addr=%lx), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
	     offset, bo->handle, *(uint64_t *)(ss+8),
	     format, width, height, bo->pitch, bo->tiling,
	     domains & 0xffff ? "render" : "sampler"));

	return offset * sizeof(uint32_t);
}

static void gen8_emit_vertex_buffer(struct sna *sna,
				    const struct sna_composite_op *op)
{
	int id = GEN8_VERTEX(op->u.gen8.flags);

	OUT_BATCH(GEN8_3DSTATE_VERTEX_BUFFERS | (5 - 2));
	OUT_BATCH(id << VB_INDEX_SHIFT | VB_MODIFY_ENABLE |
		  4*op->floats_per_vertex);
	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
	OUT_BATCH64(0);
	OUT_BATCH(~0); /* buffer size: disabled */

	sna->render.vb_id |= 1 << id;
}

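/* If the previous packet in the batch was also a 3DPRIMITIVE, reuse it:
 * point vertex_offset back at its vertex-count dword so the rectangle
 * count can simply be extended in place.
 */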
static void gen8_emit_primitive(struct sna *sna)
{
	if (sna->kgem.nbatch == sna->render_state.gen8.last_primitive) {
		sna->render.vertex_offset = sna->kgem.nbatch - 5;
		return;
	}

	OUT_BATCH(GEN8_3DPRIMITIVE | (7 - 2));
	OUT_BATCH(0); /* ignored, see VF_TOPOLOGY */
	sna->render.vertex_offset = sna->kgem.nbatch;
	OUT_BATCH(0);	/* vertex count, to be filled in later */
	OUT_BATCH(sna->render.vertex_index);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */
	sna->render.vertex_start = sna->render.vertex_index;

	sna->render_state.gen8.last_primitive = sna->kgem.nbatch;
}

static bool gen8_rectangle_begin(struct sna *sna,
				 const struct sna_composite_op *op)
{
	int id = 1 << GEN8_VERTEX(op->u.gen8.flags);
	int ndwords;

	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
		return true;

	ndwords = op->need_magic_ca_pass ? 60 : 6;
	if ((sna->render.vb_id & id) == 0)
		ndwords += 5;
	if (!kgem_check_batch(&sna->kgem, ndwords))
		return false;

	if ((sna->render.vb_id & id) == 0)
		gen8_emit_vertex_buffer(sna, op);

	gen8_emit_primitive(sna);
	return true;
}

static int gen8_get_rectangles__flush(struct sna *sna,
				      const struct sna_composite_op *op)
{
	/* Prevent the new vbo from being discarded after lock contention */
	if (sna_vertex_wait__locked(&sna->render)) {
		int rem = vertex_space(sna);
		if (rem > op->floats_per_rect)
			return rem;
	}

	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 6))
		return 0;
	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
		return 0;

	if (sna->render.vertex_offset) {
		gen8_vertex_flush(sna);
		if (gen8_magic_ca_pass(sna, op)) {
			gen8_emit_pipe_invalidate(sna);
			gen8_emit_cc(sna, GEN8_BLEND(op->u.gen8.flags));
			gen8_emit_wm(sna, GEN8_KERNEL(op->u.gen8.flags));
		}
	}

	return gen8_vertex_finish(sna);
}

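/* Claim space for up to `want` rectangles, flushing the vbo and/or the
 * batch as required. Each rectangle is emitted as 3 vertices of a
 * RECTLIST, hence the vertex_index += 3 * want accounting below.
 */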
inline static int gen8_get_rectangles(struct sna *sna,
				      const struct sna_composite_op *op,
				      int want,
				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
{
	int rem;

	assert(want);

start:
	rem = vertex_space(sna);
	if (unlikely(rem < op->floats_per_rect)) {
		DBG(("flushing vbo for %s: %d < %d\n",
		     __FUNCTION__, rem, op->floats_per_rect));
		rem = gen8_get_rectangles__flush(sna, op);
		if (unlikely(rem == 0))
			goto flush;
	}

	if (unlikely(sna->render.vertex_offset == 0)) {
		if (!gen8_rectangle_begin(sna, op))
			goto flush;
		else
			goto start;
	}

	assert(rem <= vertex_space(sna));
	assert(op->floats_per_rect <= rem);
	if (want > 1 && want * op->floats_per_rect > rem)
		want = rem / op->floats_per_rect;

	assert(want > 0);
	sna->render.vertex_index += 3*want;
	return want;

flush:
	if (sna->render.vertex_offset) {
		gen8_vertex_flush(sna);
		gen8_magic_ca_pass(sna, op);
	}
	sna_vertex_wait__locked(&sna->render);
	_kgem_submit(&sna->kgem);
	emit_state(sna, op);
	goto start;
}

inline static uint32_t *gen8_composite_get_binding_table(struct sna *sna,
							 uint16_t *offset)
{
	uint32_t *table;

	assert(sna->kgem.surface <= 16384);
	sna->kgem.surface -= SURFACE_DW;
	/* Clear all surplus entries to zero in case of prefetch */
	table = memset(sna->kgem.batch + sna->kgem.surface, 0, 64);

	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));

	*offset = sna->kgem.surface;
	return table;
}

static void
gen8_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);

	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 2*(1+3))) {
		DBG(("%s: flushing batch: %d < %d+%d\n",
		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
		     150, 4*8*2));
		_kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	assert(sna->kgem.mode == KGEM_RENDER);
	assert(sna->kgem.ring == KGEM_RENDER);

	if (sna->render_state.gen8.needs_invariant)
		gen8_emit_invariant(sna);
}

static void gen8_emit_composite_state(struct sna *sna,
				      const struct sna_composite_op *op)
{
	uint32_t *binding_table;
	uint16_t offset, dirty;

	gen8_get_batch(sna, op);

	binding_table = gen8_composite_get_binding_table(sna, &offset);

	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table[0] =
		gen8_bind_bo(sna,
			    op->dst.bo, op->dst.width, op->dst.height,
			    gen8_get_dest_format(op->dst.format),
			    true);
	binding_table[1] =
		gen8_bind_bo(sna,
			     op->src.bo, op->src.width, op->src.height,
			     op->src.card_format,
			     false);
	if (op->mask.bo) {
		binding_table[2] =
			gen8_bind_bo(sna,
				     op->mask.bo,
				     op->mask.width,
				     op->mask.height,
				     op->mask.card_format,
				     false);
	}

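	/* If this binding table matches the one already in place, drop it
	 * and reuse the previous entry, reclaiming the batch space.
	 */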
	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen8.surface_table) == *(uint64_t*)binding_table &&
	    (op->mask.bo == NULL ||
	     sna->kgem.batch[sna->render_state.gen8.surface_table+2] == binding_table[2])) {
		sna->kgem.surface += SURFACE_DW;
		offset = sna->render_state.gen8.surface_table;
	}

	if (sna->kgem.batch[sna->render_state.gen8.surface_table] == binding_table[0])
		dirty = 0;

	gen8_emit_state(sna, op, offset | dirty);
}

static void
gen8_align_vertex(struct sna *sna, const struct sna_composite_op *op)
{
	if (op->floats_per_vertex != sna->render_state.gen8.floats_per_vertex) {
		DBG(("aligning vertex: was %d, now %d floats per vertex\n",
		     sna->render_state.gen8.floats_per_vertex, op->floats_per_vertex));
		gen8_vertex_align(sna, op);
		sna->render_state.gen8.floats_per_vertex = op->floats_per_vertex;
	}
}

fastcall static void
gen8_render_composite_blt(struct sna *sna,
			  const struct sna_composite_op *op,
			  const struct sna_composite_rectangles *r)
{
	gen8_get_rectangles(sna, op, 1, gen8_emit_composite_state);
	op->prim_emit(sna, op, r);
}

fastcall static void
gen8_render_composite_box(struct sna *sna,
			  const struct sna_composite_op *op,
			  const BoxRec *box)
{
	struct sna_composite_rectangles r;

	gen8_get_rectangles(sna, op, 1, gen8_emit_composite_state);

	DBG(("  %s: (%d, %d), (%d, %d)\n",
	     __FUNCTION__,
	     box->x1, box->y1, box->x2, box->y2));

	r.dst.x = box->x1;
	r.dst.y = box->y1;
	r.width  = box->x2 - box->x1;
	r.height = box->y2 - box->y1;
	r.src = r.mask = r.dst;

	op->prim_emit(sna, op, &r);
}

static void
gen8_render_composite_boxes__blt(struct sna *sna,
				 const struct sna_composite_op *op,
				 const BoxRec *box, int nbox)
{
	DBG(("composite_boxes(%d)\n", nbox));

	do {
		int nbox_this_time;

		nbox_this_time = gen8_get_rectangles(sna, op, nbox,
						     gen8_emit_composite_state);
		nbox -= nbox_this_time;

		do {
			struct sna_composite_rectangles r;

			DBG(("  %s: (%d, %d), (%d, %d)\n",
			     __FUNCTION__,
			     box->x1, box->y1, box->x2, box->y2));

			r.dst.x = box->x1;
			r.dst.y = box->y1;
			r.width  = box->x2 - box->x1;
			r.height = box->y2 - box->y1;
			r.src = r.mask = r.dst;

			op->prim_emit(sna, op, &r);
			box++;
		} while (--nbox_this_time);
	} while (nbox);
}

static void
gen8_render_composite_boxes(struct sna *sna,
			    const struct sna_composite_op *op,
			    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen8_get_rectangles(sna, op, nbox,
						     gen8_emit_composite_state);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;
	} while (nbox);
}

static void
gen8_render_composite_boxes__thread(struct sna *sna,
				    const struct sna_composite_op *op,
				    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen8_get_rectangles(sna, op, nbox,
						     gen8_emit_composite_state);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}

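/* Precompute a blend state for every (src, dst) factor pair at a fixed
 * 64-byte stride, so that BLEND_OFFSET() can index the array directly
 * at run time.
 */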
1826static uint32_t
1827gen8_create_blend_state(struct sna_static_stream *stream)
1828{
1829	char *base, *ptr;
1830	int src, dst;
1831
1832	COMPILE_TIME_ASSERT(((GEN8_BLENDFACTOR_COUNT * GEN8_BLENDFACTOR_COUNT << 4) & (1 << 15)) == 0);
1833
1834	base = sna_static_stream_map(stream,
1835				     GEN8_BLENDFACTOR_COUNT * GEN8_BLENDFACTOR_COUNT * GEN8_BLEND_STATE_PADDED_SIZE,
1836				     64);
1837
1838	ptr = base;
1839	for (src = 0; src < GEN8_BLENDFACTOR_COUNT; src++) {
1840		for (dst = 0; dst < GEN8_BLENDFACTOR_COUNT; dst++) {
1841			struct gen8_blend_state *blend =
1842				(struct gen8_blend_state *)ptr;
1843
1844			assert(((ptr - base) & 63) == 0);
1845			COMPILE_TIME_ASSERT(sizeof(blend->common) == 4);
1846			COMPILE_TIME_ASSERT(sizeof(blend->rt) == 8);
1847			COMPILE_TIME_ASSERT((char *)&blend->rt - (char *)blend == 4);
1848
1849			blend->rt.post_blend_clamp = 1;
1850			blend->rt.pre_blend_clamp = 1;
1851
1852			blend->rt.color_blend =
1853				!(dst == BLENDFACTOR_ZERO && src == BLENDFACTOR_ONE);
1854			blend->rt.dest_blend_factor = dst;
1855			blend->rt.source_blend_factor = src;
1856			blend->rt.color_blend_function = BLENDFUNCTION_ADD;
1857
1858			blend->rt.dest_alpha_blend_factor = dst;
1859			blend->rt.source_alpha_blend_factor = src;
1860			blend->rt.alpha_blend_function = BLENDFUNCTION_ADD;
1861
1862			ptr += GEN8_BLEND_STATE_PADDED_SIZE;
1863		}
1864	}
1865
1866	return sna_static_stream_offsetof(stream, base);
1867}
1868
1869static int
1870gen8_composite_picture(struct sna *sna,
1871		       PicturePtr picture,
1872		       struct sna_composite_channel *channel,
1873		       int x, int y,
1874		       int w, int h,
1875		       int dst_x, int dst_y,
1876		       bool precise)
1877{
1878	PixmapPtr pixmap;
1879	uint32_t color;
1880	int16_t dx, dy;
1881
1882	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
1883	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
1884
1885	channel->is_solid = false;
1886	channel->card_format = -1;
1887
1888	if (sna_picture_is_solid(picture, &color))
1889		return gen4_channel_init_solid(sna, channel, color);
1890
1891	if (picture->pDrawable == NULL) {
1892		int ret;
1893
1894		if (picture->pSourcePict->type == SourcePictTypeLinear)
1895			return gen4_channel_init_linear(sna, picture, channel,
1896							x, y,
1897							w, h,
1898							dst_x, dst_y);
1899
1900		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
1901		ret = -1;
1902		if (!precise)
1903			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
1904								      x, y, w, h, dst_x, dst_y);
1905		if (ret == -1)
1906			ret = sna_render_picture_fixup(sna, picture, channel,
1907						       x, y, w, h, dst_x, dst_y);
1908		return ret;
1909	}
1910
1911	if (picture->alphaMap) {
1912		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
1913		return sna_render_picture_fixup(sna, picture, channel,
1914						x, y, w, h, dst_x, dst_y);
1915	}
1916
1917	if (!gen8_check_repeat(picture))
1918		return sna_render_picture_fixup(sna, picture, channel,
1919						x, y, w, h, dst_x, dst_y);
1920
1921	if (!gen8_check_filter(picture))
1922		return sna_render_picture_fixup(sna, picture, channel,
1923						x, y, w, h, dst_x, dst_y);
1924
1925	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
1926	channel->filter = picture->filter;
1927
1928	pixmap = get_drawable_pixmap(picture->pDrawable);
1929	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);
1930
1931	x += dx + picture->pDrawable->x;
1932	y += dy + picture->pDrawable->y;
1933
1934	channel->is_affine = sna_transform_is_affine(picture->transform);
1935	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
1936		DBG(("%s: integer translation (%d, %d), removing\n",
1937		     __FUNCTION__, dx, dy));
1938		x += dx;
1939		y += dy;
1940		channel->transform = NULL;
1941		channel->filter = PictFilterNearest;
1942
1943		if (channel->repeat ||
1944		    (x >= 0 &&
1945		     y >= 0 &&
1946		     x + w <= pixmap->drawable.width &&
1947		     y + h <= pixmap->drawable.height)) {
1948			struct sna_pixmap *priv = sna_pixmap(pixmap);
1949			if (priv && priv->clear) {
1950				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
1951				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
1952			}
1953		}
1954	} else
1955		channel->transform = picture->transform;
1956
1957	channel->pict_format = picture->format;
1958	channel->card_format = gen8_get_card_format(picture->format);
1959	if (channel->card_format == (unsigned)-1)
1960		return sna_render_picture_convert(sna, picture, channel, pixmap,
1961						  x, y, w, h, dst_x, dst_y,
1962						  false);
1963
1964	if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
1965		DBG(("%s: extracting from pixmap %dx%d\n", __FUNCTION__,
1966		     pixmap->drawable.width, pixmap->drawable.height));
1967		return sna_render_picture_extract(sna, picture, channel,
1968						  x, y, w, h, dst_x, dst_y);
1969	}
1970
1971	return sna_render_pixmap_bo(sna, channel, pixmap,
1972				    x, y, w, h, dst_x, dst_y);
1973}
1974
1975inline static bool gen8_composite_channel_convert(struct sna_composite_channel *channel)
1976{
1977	if (unaligned(channel->bo, PICT_FORMAT_BPP(channel->pict_format)))
1978		return false;
1979
1980	channel->repeat = gen8_repeat(channel->repeat);
1981	channel->filter = gen8_filter(channel->filter);
1982	if (channel->card_format == (unsigned)-1)
1983		channel->card_format = gen8_get_card_format(channel->pict_format);
1984	assert(channel->card_format != (unsigned)-1);
1985
1986	return true;
1987}
1988
1989static void gen8_render_composite_done(struct sna *sna,
1990				       const struct sna_composite_op *op)
1991{
1992	if (sna->render.vertex_offset) {
1993		gen8_vertex_flush(sna);
1994		gen8_magic_ca_pass(sna, op);
1995	}
1996
1997	if (op->mask.bo)
1998		kgem_bo_destroy(&sna->kgem, op->mask.bo);
1999	if (op->src.bo)
2000		kgem_bo_destroy(&sna->kgem, op->src.bo);
2001
2002	sna_render_composite_redirect_done(sna, op);
2003}
2004
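/* Select the destination bo. The hint describes the access pattern:
 * non-partial updates may IGNORE_DAMAGE, and a full-size update REPLACES
 * the contents entirely, which also allows any pending writes to the old
 * buffers to be discarded (see kgem_bo_pair_undo below).
 */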
2005inline static bool
2006gen8_composite_set_target(struct sna *sna,
2007			  struct sna_composite_op *op,
2008			  PicturePtr dst,
2009			  int x, int y, int w, int h,
2010			  bool partial)
2011{
2012	BoxRec box;
2013	unsigned int hint;
2014
2015	DBG(("%s: (%d, %d)x(%d, %d), partial?=%d\n", __FUNCTION__, x, y, w, h, partial));
2016
2017	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
2018	op->dst.format = dst->format;
2019	op->dst.width  = op->dst.pixmap->drawable.width;
2020	op->dst.height = op->dst.pixmap->drawable.height;
2021
2022	if (w | h) {
2023		assert(w && h);
2024		box.x1 = x;
2025		box.y1 = y;
2026		box.x2 = x + w;
2027		box.y2 = y + h;
2028	} else
2029		sna_render_picture_extents(dst, &box);
2030
2031	hint = PREFER_GPU | RENDER_GPU;
2032	if (!need_tiling(sna, op->dst.width, op->dst.height))
2033		hint |= FORCE_GPU;
2034	if (!partial) {
2035		hint |= IGNORE_DAMAGE;
2036		if (w == op->dst.width && h == op->dst.height)
2037			hint |= REPLACES;
2038	}
2039
2040	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
2041	if (op->dst.bo == NULL)
2042		return false;
2043
2044	if (unaligned(op->dst.bo, dst->pDrawable->bitsPerPixel))
2045		return false;
2046
2047	if (hint & REPLACES) {
2048		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
2049		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
2050	}
2051
2052	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
2053			    &op->dst.x, &op->dst.y);
2054
2055	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
2056	     __FUNCTION__,
2057	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
2058	     op->dst.width, op->dst.height,
2059	     op->dst.bo->pitch,
2060	     op->dst.x, op->dst.y,
2061	     op->damage ? *op->damage : (void *)-1));
2062
2063	assert(op->dst.bo->proxy == NULL);
2064
2065	if (too_large(op->dst.width, op->dst.height) &&
2066	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
2067		return false;
2068
2069	return true;
2070}
2071
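/* Heuristic check for whether this composite is better served by the BLT
 * engine: stay on the blitter if we are already there, if the operation
 * exceeds the 3D pipe limits, or if an untiled bo would thrash the TLB.
 * Y-tiled targets, which the blitter cannot handle, are marked for
 * upload instead.
 */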
2072static bool
2073try_blt(struct sna *sna,
2074	uint8_t op,
2075	PicturePtr src,
2076	PicturePtr mask,
2077	PicturePtr dst,
2078	int16_t src_x, int16_t src_y,
2079	int16_t msk_x, int16_t msk_y,
2080	int16_t dst_x, int16_t dst_y,
2081	int16_t width, int16_t height,
2082	unsigned flags,
2083	struct sna_composite_op *tmp)
2084{
2085	struct kgem_bo *bo;
2086
2087	if (sna->kgem.mode == KGEM_BLT) {
2088		DBG(("%s: already performing BLT\n", __FUNCTION__));
2089		goto execute;
2090	}
2091
2092	if (too_large(width, height)) {
2093		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
2094		     __FUNCTION__, width, height));
2095		goto execute;
2096	}
2097
2098	bo = __sna_drawable_peek_bo(dst->pDrawable);
2099	if (bo == NULL)
2100		goto execute;
2101
2102	if (untiled_tlb_miss(bo))
2103		goto execute;
2104
2105	if (bo->rq) {
2106		if (RQ_IS_BLT(bo->rq))
2107			goto execute;
2108
2109		return false;
2110	}
2111
2112	if (bo->tiling == I915_TILING_Y)
2113		goto upload;
2114
2115	if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, bo, 0))
2116		goto execute;
2117
2118	if (src->pDrawable == dst->pDrawable &&
2119	    (sna->render_state.gt < 3 || width*height < 1024) &&
2120	    can_switch_to_blt(sna, bo, 0))
2121		goto execute;
2122
2123	if (src->pDrawable) {
2124		struct kgem_bo *s = __sna_drawable_peek_bo(src->pDrawable);
2125		if (s == NULL)
2126			goto upload;
2127
2128		if (prefer_blt_bo(sna, s, bo))
2129			goto execute;
2130	}
2131
2132	if (sna->kgem.ring == KGEM_BLT) {
2133		DBG(("%s: already performing BLT\n", __FUNCTION__));
2134		goto execute;
2135	}
2136
2137upload:
2138	flags |= COMPOSITE_UPLOAD;
2139execute:
2140	return sna_blt_composite(sna, op,
2141				 src, dst,
2142				 src_x, src_y,
2143				 dst_x, dst_y,
2144				 width, height,
2145				 flags, tmp);
2146}
2147
2148static bool
2149check_gradient(PicturePtr picture, bool precise)
2150{
2151	if (picture->pDrawable)
2152		return false;
2153
2154	switch (picture->pSourcePict->type) {
2155	case SourcePictTypeSolidFill:
2156	case SourcePictTypeLinear:
2157		return false;
2158	default:
2159		return precise;
2160	}
2161}
2162
2163static bool
2164has_alphamap(PicturePtr p)
2165{
2166	return p->alphaMap != NULL;
2167}
2168
2169static bool
2170need_upload(PicturePtr p)
2171{
2172	return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
2173}
2174
2175static bool
2176source_is_busy(PixmapPtr pixmap)
2177{
2178	struct sna_pixmap *priv = sna_pixmap(pixmap);
2179	if (priv == NULL || priv->clear)
2180		return false;
2181
2182	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
2183		return true;
2184
2185	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
2186		return true;
2187
2188	return priv->gpu_damage && !priv->cpu_damage;
2189}
2190
2191static bool
2192source_fallback(PicturePtr p, PixmapPtr pixmap, bool precise)
2193{
2194	if (sna_picture_is_solid(p, NULL))
2195		return false;
2196
2197	if (p->pSourcePict)
2198		return check_gradient(p, precise);
2199
2200	if (!gen8_check_repeat(p) || !gen8_check_format(p->format))
2201		return true;
2202
2203	if (pixmap && source_is_busy(pixmap))
2204		return false;
2205
2206	return has_alphamap(p) || !gen8_check_filter(p) || need_upload(p);
2207}
2208
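/* Global CPU-vs-GPU decision for the composite: fall back only when the
 * destination is not already resident on the GPU and at least one
 * channel would itself need CPU preparation, in which case migrating
 * everything is likely to cost more than rendering in place.
 */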
2209static bool
2210gen8_composite_fallback(struct sna *sna,
2211			PicturePtr src,
2212			PicturePtr mask,
2213			PicturePtr dst)
2214{
2215	PixmapPtr src_pixmap;
2216	PixmapPtr mask_pixmap;
2217	PixmapPtr dst_pixmap;
2218	bool src_fallback, mask_fallback;
2219
2220	if (!gen8_check_dst_format(dst->format)) {
2221		DBG(("%s: unknown destination format: %d\n",
2222		     __FUNCTION__, dst->format));
2223		return true;
2224	}
2225
2226	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
2227
2228	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
2229	src_fallback = source_fallback(src, src_pixmap,
2230				       dst->polyMode == PolyModePrecise);
2231
2232	if (mask) {
2233		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
2234		mask_fallback = source_fallback(mask, mask_pixmap,
2235						dst->polyMode == PolyModePrecise);
2236	} else {
2237		mask_pixmap = NULL;
2238		mask_fallback = false;
2239	}
2240
2241	/* If we are using the destination as a source and need to
2242	 * read it back in order to upload the source, do it all
2243	 * on the CPU.
2244	 */
2245	if (src_pixmap == dst_pixmap && src_fallback) {
2246		DBG(("%s: src is dst and will fallback\n", __FUNCTION__));
2247		return true;
2248	}
2249	if (mask_pixmap == dst_pixmap && mask_fallback) {
2250		DBG(("%s: mask is dst and will fallback\n", __FUNCTION__));
2251		return true;
2252	}
2253
2254	/* If anything is on the GPU, push everything out to the GPU */
2255	if (dst_use_gpu(dst_pixmap)) {
2256		DBG(("%s: dst is already on the GPU, try to use GPU\n",
2257		     __FUNCTION__));
2258		return false;
2259	}
2260
2261	if (src_pixmap && !src_fallback) {
2262		DBG(("%s: src is already on the GPU, try to use GPU\n",
2263		     __FUNCTION__));
2264		return false;
2265	}
2266	if (mask_pixmap && !mask_fallback) {
2267		DBG(("%s: mask is already on the GPU, try to use GPU\n",
2268		     __FUNCTION__));
2269		return false;
2270	}
2271
2272	/* However, if the dst is not on the GPU and we need to
2273	 * render one of the sources using the CPU, we may
2274	 * as well do the entire operation in place on the CPU.
2275	 */
2276	if (src_fallback) {
2277		DBG(("%s: dst is on the CPU and src will fallback\n",
2278		     __FUNCTION__));
2279		return true;
2280	}
2281
2282	if (mask && mask_fallback) {
2283		DBG(("%s: dst is on the CPU and mask will fallback\n",
2284		     __FUNCTION__));
2285		return true;
2286	}
2287
2288	if (too_large(dst_pixmap->drawable.width,
2289		      dst_pixmap->drawable.height) &&
2290	    dst_is_cpu(dst_pixmap)) {
2291		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
2292		return true;
2293	}
2294
2295	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
2296	     __FUNCTION__));
2297	return dst_use_cpu(dst_pixmap);
2298}
2299
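/* If the mask samples the same drawable with the same transform and
 * alphamap as the source, reuse the prepared source channel instead of
 * preparing it twice; only format, filter and repeat are rewritten.
 */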
2300static int
2301reuse_source(struct sna *sna,
2302	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
2303	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
2304{
2305	uint32_t color;
2306
2307	if (src_x != msk_x || src_y != msk_y)
2308		return false;
2309
2310	if (src == mask) {
2311		DBG(("%s: mask is source\n", __FUNCTION__));
2312		*mc = *sc;
2313		mc->bo = kgem_bo_reference(mc->bo);
2314		return true;
2315	}
2316
2317	if (sna_picture_is_solid(mask, &color))
2318		return gen4_channel_init_solid(sna, mc, color);
2319
2320	if (sc->is_solid)
2321		return false;
2322
2323	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
2324		return false;
2325
2326	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));
2327
2328	if (!sna_transform_equal(src->transform, mask->transform))
2329		return false;
2330
2331	if (!sna_picture_alphamap_equal(src, mask))
2332		return false;
2333
2334	if (!gen8_check_repeat(mask))
2335		return false;
2336
2337	if (!gen8_check_filter(mask))
2338		return false;
2339
2340	if (!gen8_check_format(mask->format))
2341		return false;
2342
2343	DBG(("%s: reusing source channel for mask with a twist\n",
2344	     __FUNCTION__));
2345
2346	*mc = *sc;
2347	mc->repeat = gen8_repeat(mask->repeat ? mask->repeatType : RepeatNone);
2348	mc->filter = gen8_filter(mask->filter);
2349	mc->pict_format = mask->format;
2350	mc->card_format = gen8_get_card_format(mask->format);
2351	mc->bo = kgem_bo_reference(mc->bo);
2352	return true;
2353}
2354
2355static bool
2356gen8_render_composite(struct sna *sna,
2357		      uint8_t op,
2358		      PicturePtr src,
2359		      PicturePtr mask,
2360		      PicturePtr dst,
2361		      int16_t src_x, int16_t src_y,
2362		      int16_t msk_x, int16_t msk_y,
2363		      int16_t dst_x, int16_t dst_y,
2364		      int16_t width, int16_t height,
2365		      unsigned flags,
2366		      struct sna_composite_op *tmp)
2367{
2368	if (op >= ARRAY_SIZE(gen8_blend_op))
2369		return false;
2370
2371	DBG(("%s: %dx%d, current mode=%d/%d\n", __FUNCTION__,
2372	     width, height, sna->kgem.mode, sna->kgem.ring));
2373
2374	if (mask == NULL &&
2375	    try_blt(sna, op,
2376		    src, mask, dst,
2377		    src_x, src_y,
2378		    msk_x, msk_y,
2379		    dst_x, dst_y,
2380		    width, height,
2381		    flags, tmp))
2382		return true;
2383
2384	if (gen8_composite_fallback(sna, src, mask, dst))
2385		goto fallback;
2386
2387	if (need_tiling(sna, width, height))
2388		return sna_tiling_composite(op, src, mask, dst,
2389					    src_x, src_y,
2390					    msk_x, msk_y,
2391					    dst_x, dst_y,
2392					    width, height,
2393					    tmp);
2394
2395	if (op == PictOpClear && src == sna->clear)
2396		op = PictOpSrc;
2397	tmp->op = op;
2398	if (!gen8_composite_set_target(sna, tmp, dst,
2399				       dst_x, dst_y, width, height,
2400				       flags & COMPOSITE_PARTIAL || op > PictOpSrc))
2401		goto fallback;
2402
2403	switch (gen8_composite_picture(sna, src, &tmp->src,
2404				       src_x, src_y,
2405				       width, height,
2406				       dst_x, dst_y,
2407				       dst->polyMode == PolyModePrecise)) {
2408	case -1:
2409		goto cleanup_dst;
2410	case 0:
2411		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
2412			goto cleanup_dst;
2413		/* fall through to fixup */
2414	case 1:
2415		/* Did we just switch rings to prepare the source? */
2416		if (mask == NULL &&
2417		    (prefer_blt_composite(sna, tmp) ||
2418		     unaligned(tmp->src.bo, PICT_FORMAT_BPP(tmp->src.pict_format))) &&
2419		    sna_blt_composite__convert(sna,
2420					       dst_x, dst_y, width, height,
2421					       tmp))
2422			return true;
2423
2424		if (!gen8_composite_channel_convert(&tmp->src))
2425			goto cleanup_src;
2426
2427		break;
2428	}
2429
2430	tmp->is_affine = tmp->src.is_affine;
2431	tmp->has_component_alpha = false;
2432	tmp->need_magic_ca_pass = false;
2433
2434	tmp->mask.bo = NULL;
2435	tmp->mask.filter = SAMPLER_FILTER_NEAREST;
2436	tmp->mask.repeat = SAMPLER_EXTEND_NONE;
2437
2438	if (mask) {
2439		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
2440			tmp->has_component_alpha = true;
2441
2442			/* Check if it's component alpha that relies on a source alpha and on
2443			 * the source value.  We can only get one of those into the single
2444			 * source value that we get to blend with.
2445			 */
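			/* The workaround is a two-pass blend: the first
			 * pass runs OutReverse to knock the destination
			 * back by mask*alpha, then gen8_magic_ca_pass()
			 * replays the rectangles with an ADD blend to
			 * accumulate src*mask; together these compute the
			 * component-alpha OVER.
			 */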
2446			if (gen8_blend_op[op].src_alpha &&
2447			    (gen8_blend_op[op].src_blend != BLENDFACTOR_ZERO)) {
2448				if (op != PictOpOver)
2449					goto cleanup_src;
2450
2451				tmp->need_magic_ca_pass = true;
2452				tmp->op = PictOpOutReverse;
2453			}
2454		}
2455
2456		if (!reuse_source(sna,
2457				  src, &tmp->src, src_x, src_y,
2458				  mask, &tmp->mask, msk_x, msk_y)) {
2459			switch (gen8_composite_picture(sna, mask, &tmp->mask,
2460						       msk_x, msk_y,
2461						       width, height,
2462						       dst_x, dst_y,
2463						       dst->polyMode == PolyModePrecise)) {
2464			case -1:
2465				goto cleanup_src;
2466			case 0:
2467				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
2468					goto cleanup_src;
2469				/* fall through to fixup */
2470			case 1:
2471				if (!gen8_composite_channel_convert(&tmp->mask))
2472					goto cleanup_mask;
2473				break;
2474			}
2475		}
2476
2477		tmp->is_affine &= tmp->mask.is_affine;
2478	}
2479
2480	tmp->u.gen8.flags =
2481		GEN8_SET_FLAGS(SAMPLER_OFFSET(tmp->src.filter,
2482					      tmp->src.repeat,
2483					      tmp->mask.filter,
2484					      tmp->mask.repeat),
2485			       gen8_get_blend(tmp->op,
2486					      tmp->has_component_alpha,
2487					      tmp->dst.format),
2488			       gen8_choose_composite_kernel(tmp->op,
2489							    tmp->mask.bo != NULL,
2490							    tmp->has_component_alpha,
2491							    tmp->is_affine),
2492			       gen4_choose_composite_emitter(sna, tmp));
2493
2494	tmp->blt   = gen8_render_composite_blt;
2495	tmp->box   = gen8_render_composite_box;
2496	tmp->boxes = gen8_render_composite_boxes__blt;
2497	if (tmp->emit_boxes) {
2498		tmp->boxes = gen8_render_composite_boxes;
2499		tmp->thread_boxes = gen8_render_composite_boxes__thread;
2500	}
2501	tmp->done  = gen8_render_composite_done;
2502
2503	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
2504	if (!kgem_check_bo(&sna->kgem,
2505			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2506			   NULL)) {
2507		kgem_submit(&sna->kgem);
2508		if (!kgem_check_bo(&sna->kgem,
2509				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2510				   NULL))
2511			goto cleanup_mask;
2512		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
2513	}
2514
2515	gen8_align_vertex(sna, tmp);
2516	gen8_emit_composite_state(sna, tmp);
2517	return true;
2518
2519cleanup_mask:
2520	if (tmp->mask.bo) {
2521		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
2522		tmp->mask.bo = NULL;
2523	}
2524cleanup_src:
2525	if (tmp->src.bo) {
2526		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
2527		tmp->src.bo = NULL;
2528	}
2529cleanup_dst:
2530	if (tmp->redirect.real_bo) {
2531		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
2532		tmp->redirect.real_bo = NULL;
2533	}
2534fallback:
2535	return (mask == NULL &&
2536		sna_blt_composite(sna, op,
2537				  src, dst,
2538				  src_x, src_y,
2539				  dst_x, dst_y,
2540				  width, height,
2541				  flags | COMPOSITE_FALLBACK, tmp));
2542}
2543
2544#if !NO_COMPOSITE_SPANS
2545fastcall static void
2546gen8_render_composite_spans_box(struct sna *sna,
2547				const struct sna_composite_spans_op *op,
2548				const BoxRec *box, float opacity)
2549{
2550	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
2551	     __FUNCTION__,
2552	     op->base.src.offset[0], op->base.src.offset[1],
2553	     opacity,
2554	     op->base.dst.x, op->base.dst.y,
2555	     box->x1, box->y1,
2556	     box->x2 - box->x1,
2557	     box->y2 - box->y1));
2558
2559	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_composite_state);
2560	op->prim_emit(sna, op, box, opacity);
2561}
2562
2563static void
2564gen8_render_composite_spans_boxes(struct sna *sna,
2565				  const struct sna_composite_spans_op *op,
2566				  const BoxRec *box, int nbox,
2567				  float opacity)
2568{
2569	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
2570	     __FUNCTION__, nbox,
2571	     op->base.src.offset[0], op->base.src.offset[1],
2572	     opacity,
2573	     op->base.dst.x, op->base.dst.y));
2574
2575	do {
2576		int nbox_this_time;
2577
2578		nbox_this_time = gen8_get_rectangles(sna, &op->base, nbox,
2579						     gen8_emit_composite_state);
2580		nbox -= nbox_this_time;
2581
2582		do {
2583			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2584			     box->x1, box->y1,
2585			     box->x2 - box->x1,
2586			     box->y2 - box->y1));
2587
2588			op->prim_emit(sna, op, box++, opacity);
2589		} while (--nbox_this_time);
2590	} while (nbox);
2591}
2592
2593fastcall static void
2594gen8_render_composite_spans_boxes__thread(struct sna *sna,
2595					  const struct sna_composite_spans_op *op,
2596					  const struct sna_opacity_box *box,
2597					  int nbox)
2598{
2599	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
2600	     __FUNCTION__, nbox,
2601	     op->base.src.offset[0], op->base.src.offset[1],
2602	     op->base.dst.x, op->base.dst.y));
2603
2604	sna_vertex_lock(&sna->render);
2605	do {
2606		int nbox_this_time;
2607		float *v;
2608
2609		nbox_this_time = gen8_get_rectangles(sna, &op->base, nbox,
2610						     gen8_emit_composite_state);
2611		assert(nbox_this_time);
2612		nbox -= nbox_this_time;
2613
2614		v = sna->render.vertices + sna->render.vertex_used;
2615		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
2616
2617		sna_vertex_acquire__locked(&sna->render);
2618		sna_vertex_unlock(&sna->render);
2619
2620		op->emit_boxes(op, box, nbox_this_time, v);
2621		box += nbox_this_time;
2622
2623		sna_vertex_lock(&sna->render);
2624		sna_vertex_release__locked(&sna->render);
2625	} while (nbox);
2626	sna_vertex_unlock(&sna->render);
2627}
2628
2629fastcall static void
2630gen8_render_composite_spans_done(struct sna *sna,
2631				 const struct sna_composite_spans_op *op)
2632{
2633	if (sna->render.vertex_offset)
2634		gen8_vertex_flush(sna);
2635
2636	DBG(("%s()\n", __FUNCTION__));
2637
2638	if (op->base.src.bo)
2639		kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2640
2641	sna_render_composite_redirect_done(sna, &op->base);
2642}
2643
2644static bool
2645gen8_check_composite_spans(struct sna *sna,
2646			   uint8_t op, PicturePtr src, PicturePtr dst,
2647			   int16_t width, int16_t height, unsigned flags)
2648{
2649	if (op >= ARRAY_SIZE(gen8_blend_op))
2650		return false;
2651
2652	if (gen8_composite_fallback(sna, src, NULL, dst))
2653		return false;
2654
2655	if (need_tiling(sna, width, height) &&
2656	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
2657		DBG(("%s: fallback, tiled operation not on GPU\n",
2658		     __FUNCTION__));
2659		return false;
2660	}
2661
2662	return true;
2663}
2664
2665static bool
2666gen8_render_composite_spans(struct sna *sna,
2667			    uint8_t op,
2668			    PicturePtr src,
2669			    PicturePtr dst,
2670			    int16_t src_x,  int16_t src_y,
2671			    int16_t dst_x,  int16_t dst_y,
2672			    int16_t width,  int16_t height,
2673			    unsigned flags,
2674			    struct sna_composite_spans_op *tmp)
2675{
2676	DBG(("%s: %dx%d with flags=%x, current mode=%d\n", __FUNCTION__,
2677	     width, height, flags, sna->kgem.ring));
2678
2679	assert(gen8_check_composite_spans(sna, op, src, dst, width, height, flags));
2680
2681	if (need_tiling(sna, width, height)) {
2682		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
2683		     __FUNCTION__, width, height));
2684		return sna_tiling_composite_spans(op, src, dst,
2685						  src_x, src_y, dst_x, dst_y,
2686						  width, height, flags, tmp);
2687	}
2688
2689	tmp->base.op = op;
2690	if (!gen8_composite_set_target(sna, &tmp->base, dst,
2691				       dst_x, dst_y, width, height, true))
2692		return false;
2693
2694	switch (gen8_composite_picture(sna, src, &tmp->base.src,
2695				       src_x, src_y,
2696				       width, height,
2697				       dst_x, dst_y,
2698				       dst->polyMode == PolyModePrecise)) {
2699	case -1:
2700		goto cleanup_dst;
2701	case 0:
2702		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
2703			goto cleanup_dst;
2704		/* fall through to fixup */
2705	case 1:
2706		if (!gen8_composite_channel_convert(&tmp->base.src))
2707			goto cleanup_src;
2708		break;
2709	}
2710	tmp->base.mask.bo = NULL;
2711
2712	tmp->base.is_affine = tmp->base.src.is_affine;
2713	tmp->base.need_magic_ca_pass = false;
2714
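	/* Spans use the OPACITY kernel: the emitter (chosen by
	 * gen4_choose_spans_emitter) appends a per-rectangle opacity to
	 * each vertex, and or-ing in !is_affine selects the projective
	 * variant of the kernel.
	 */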
2715	tmp->base.u.gen8.flags =
2716		GEN8_SET_FLAGS(SAMPLER_OFFSET(tmp->base.src.filter,
2717					      tmp->base.src.repeat,
2718					      SAMPLER_FILTER_NEAREST,
2719					      SAMPLER_EXTEND_PAD),
2720			       gen8_get_blend(tmp->base.op, false, tmp->base.dst.format),
2721			       GEN8_WM_KERNEL_OPACITY | !tmp->base.is_affine,
2722			       gen4_choose_spans_emitter(sna, tmp));
2723
2724	tmp->box   = gen8_render_composite_spans_box;
2725	tmp->boxes = gen8_render_composite_spans_boxes;
2726	if (tmp->emit_boxes)
2727		tmp->thread_boxes = gen8_render_composite_spans_boxes__thread;
2728	tmp->done  = gen8_render_composite_spans_done;
2729
2730	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
2731	if (!kgem_check_bo(&sna->kgem,
2732			   tmp->base.dst.bo, tmp->base.src.bo,
2733			   NULL)) {
2734		kgem_submit(&sna->kgem);
2735		if (!kgem_check_bo(&sna->kgem,
2736				   tmp->base.dst.bo, tmp->base.src.bo,
2737				   NULL))
2738			goto cleanup_src;
2739		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
2740	}
2741
2742	gen8_align_vertex(sna, &tmp->base);
2743	gen8_emit_composite_state(sna, &tmp->base);
2744	return true;
2745
2746cleanup_src:
2747	if (tmp->base.src.bo)
2748		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
2749cleanup_dst:
2750	if (tmp->base.redirect.real_bo)
2751		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
2752	return false;
2753}
2754#endif
2755
2756static void
2757gen8_emit_copy_state(struct sna *sna,
2758		     const struct sna_composite_op *op)
2759{
2760	uint32_t *binding_table;
2761	uint16_t offset, dirty;
2762
2763	gen8_get_batch(sna, op);
2764
2765	binding_table = gen8_composite_get_binding_table(sna, &offset);
2766
2767	dirty = kgem_bo_is_dirty(op->dst.bo);
2768
2769	binding_table[0] =
2770		gen8_bind_bo(sna,
2771			     op->dst.bo, op->dst.width, op->dst.height,
2772			     gen8_get_dest_format(op->dst.format),
2773			     true);
2774	binding_table[1] =
2775		gen8_bind_bo(sna,
2776			     op->src.bo, op->src.width, op->src.height,
2777			     op->src.card_format,
2778			     false);
2779
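	/* If this binding table matches the one last emitted, rewind the
	 * surface-state allocation and reuse the previous table; and if at
	 * least the render target is unchanged, the dirty flag (which
	 * would force a flush) can be dropped too.
	 */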
2780	if (sna->kgem.surface == offset &&
2781	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen8.surface_table) == *(uint64_t*)binding_table) {
2782		sna->kgem.surface += SURFACE_DW;
2783		offset = sna->render_state.gen8.surface_table;
2784	}
2785
2786	if (sna->kgem.batch[sna->render_state.gen8.surface_table] == binding_table[0])
2787		dirty = 0;
2788
2789	assert(!GEN8_READS_DST(op->u.gen8.flags));
2790	gen8_emit_state(sna, op, offset | dirty);
2791}
2792
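/* Ring selection for copies, along the same lines as try_blt(): favour
 * the blitter when already on it, for untiled (TLB-miss prone) buffers,
 * and for small or self copies on the smaller GTs; bos in active render
 * use stay on the render ring to avoid bouncing between engines.
 */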
2793static inline bool
2794prefer_blt_copy(struct sna *sna,
2795		struct kgem_bo *src_bo,
2796		struct kgem_bo *dst_bo,
2797		unsigned flags)
2798{
2799	if (sna->kgem.mode == KGEM_BLT)
2800		return true;
2801
2802	assert((flags & COPY_SYNC) == 0);
2803
2804	if (untiled_tlb_miss(src_bo) ||
2805	    untiled_tlb_miss(dst_bo))
2806		return true;
2807
2808	if (flags & COPY_DRI && !sna->kgem.has_semaphores)
2809		return false;
2810
2811	if (force_blt_ring(sna, dst_bo))
2812		return true;
2813
2814	if ((flags & COPY_SMALL ||
2815	     (sna->render_state.gt < 3 && src_bo == dst_bo)) &&
2816	    can_switch_to_blt(sna, dst_bo, flags))
2817		return true;
2818
2819	if (kgem_bo_is_render(dst_bo) ||
2820	    kgem_bo_is_render(src_bo))
2821		return false;
2822
2823	if (flags & COPY_LAST &&
2824	    sna->render_state.gt < 3 &&
2825	    can_switch_to_blt(sna, dst_bo, flags))
2826		return true;
2827
2828	if (prefer_render_ring(sna, dst_bo))
2829		return false;
2830
2831	if (!prefer_blt_ring(sna, dst_bo, flags))
2832		return false;
2833
2834	return prefer_blt_bo(sna, src_bo, dst_bo);
2835}
2836
2837static bool
2838gen8_render_copy_boxes(struct sna *sna, uint8_t alu,
2839		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
2840		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
2841		       const BoxRec *box, int n, unsigned flags)
2842{
2843	struct sna_composite_op tmp;
2844	BoxRec extents;
2845
2846	DBG(("%s (%d, %d)->(%d, %d) x %d, alu=%x, flags=%x, self-copy=%d, overlaps? %d\n",
2847	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu, flags,
2848	     src_bo == dst_bo,
2849	     overlaps(sna,
2850		      src_bo, src_dx, src_dy,
2851		      dst_bo, dst_dx, dst_dy,
2852		      box, n, flags, &extents)));
2853
2854	if (prefer_blt_copy(sna, src_bo, dst_bo, flags) &&
2855	    sna_blt_compare_depth(src, dst) &&
2856	    sna_blt_copy_boxes(sna, alu,
2857			       src_bo, src_dx, src_dy,
2858			       dst_bo, dst_dx, dst_dy,
2859			       dst->bitsPerPixel,
2860			       box, n))
2861		return true;
2862
2863	if (!(alu == GXcopy || alu == GXclear) ||
2864	    unaligned(src_bo, src->bitsPerPixel) ||
2865	    unaligned(dst_bo, dst->bitsPerPixel)) {
2866fallback_blt:
2867		DBG(("%s: fallback blt\n", __FUNCTION__));
2868		if (!sna_blt_compare_depth(src, dst))
2869			return false;
2870
2871		return sna_blt_copy_boxes_fallback(sna, alu,
2872						   src, src_bo, src_dx, src_dy,
2873						   dst, dst_bo, dst_dx, dst_dy,
2874						   box, n);
2875	}
2876
2877	if (overlaps(sna,
2878		     src_bo, src_dx, src_dy,
2879		     dst_bo, dst_dx, dst_dy,
2880		     box, n, flags,
2881		     &extents)) {
2882		bool big = too_large(extents.x2-extents.x1, extents.y2-extents.y1);
2883
2884		if ((big || !prefer_render_ring(sna, dst_bo)) &&
2885		    sna_blt_copy_boxes(sna, alu,
2886				       src_bo, src_dx, src_dy,
2887				       dst_bo, dst_dx, dst_dy,
2888				       dst->bitsPerPixel,
2889				       box, n))
2890			return true;
2891
2892		if (big)
2893			goto fallback_blt;
2894
2895		assert(src_bo == dst_bo);
2896		assert(src->depth == dst->depth);
2897		assert(src->width == dst->width);
2898		assert(src->height == dst->height);
2899		return sna_render_copy_boxes__overlap(sna, alu, dst, dst_bo,
2900						      src_dx, src_dy,
2901						      dst_dx, dst_dy,
2902						      box, n, &extents);
2903	}
2904
2905	if (dst->depth == src->depth) {
2906		tmp.dst.format = sna_render_format_for_depth(dst->depth);
2907		tmp.src.pict_format = tmp.dst.format;
2908	} else {
2909		tmp.dst.format = sna_format_for_depth(dst->depth);
2910		tmp.src.pict_format = sna_format_for_depth(src->depth);
2911	}
2912	if (!gen8_check_format(tmp.src.pict_format))
2913		goto fallback_blt;
2914
2915	tmp.dst.pixmap = (PixmapPtr)dst;
2916	tmp.dst.width  = dst->width;
2917	tmp.dst.height = dst->height;
2918	tmp.dst.bo = dst_bo;
2919	tmp.dst.x = tmp.dst.y = 0;
2920	tmp.damage = NULL;
2921
2922	sna_render_composite_redirect_init(&tmp);
2923	if (too_large(tmp.dst.width, tmp.dst.height)) {
2924		int i;
2925
2926		extents = box[0];
2927		for (i = 1; i < n; i++) {
2928			if (box[i].x1 < extents.x1)
2929				extents.x1 = box[i].x1;
2930			if (box[i].y1 < extents.y1)
2931				extents.y1 = box[i].y1;
2932
2933			if (box[i].x2 > extents.x2)
2934				extents.x2 = box[i].x2;
2935			if (box[i].y2 > extents.y2)
2936				extents.y2 = box[i].y2;
2937		}
2938
2939		if (!sna_render_composite_redirect(sna, &tmp,
2940						   extents.x1 + dst_dx,
2941						   extents.y1 + dst_dy,
2942						   extents.x2 - extents.x1,
2943						   extents.y2 - extents.y1,
2944						   n > 1))
2945			goto fallback_tiled;
2946	}
2947
2948	tmp.src.card_format = gen8_get_card_format(tmp.src.pict_format);
2949	if (too_large(src->width, src->height)) {
2950		int i;
2951
2952		extents = box[0];
2953		for (i = 1; i < n; i++) {
2954			if (box[i].x1 < extents.x1)
2955				extents.x1 = box[i].x1;
2956			if (box[i].y1 < extents.y1)
2957				extents.y1 = box[i].y1;
2958
2959			if (box[i].x2 > extents.x2)
2960				extents.x2 = box[i].x2;
2961			if (box[i].y2 > extents.y2)
2962				extents.y2 = box[i].y2;
2963		}
2964
2965		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
2966					       extents.x1 + src_dx,
2967					       extents.y1 + src_dy,
2968					       extents.x2 - extents.x1,
2969					       extents.y2 - extents.y1))
2970			goto fallback_tiled_dst;
2971	} else {
2972		tmp.src.bo = src_bo;
2973		tmp.src.width  = src->width;
2974		tmp.src.height = src->height;
2975		tmp.src.offset[0] = tmp.src.offset[1] = 0;
2976	}
2977
2978	tmp.mask.bo = NULL;
2979
2980	tmp.floats_per_vertex = 2;
2981	tmp.floats_per_rect = 6;
2982	tmp.need_magic_ca_pass = 0;
2983
2984	tmp.u.gen8.flags = COPY_FLAGS(alu);
2985
2986	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
2987	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
2988		kgem_submit(&sna->kgem);
2989		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
2990			if (tmp.src.bo != src_bo)
2991				kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2992			if (tmp.redirect.real_bo)
2993				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2994			goto fallback_blt;
2995		}
2996		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
2997	}
2998
2999	src_dx += tmp.src.offset[0];
3000	src_dy += tmp.src.offset[1];
3001
3002	dst_dx += tmp.dst.x;
3003	dst_dy += tmp.dst.y;
3004
3005	tmp.dst.x = tmp.dst.y = 0;
3006
3007	gen8_align_vertex(sna, &tmp);
3008	gen8_emit_copy_state(sna, &tmp);
3009
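	/* Each rectangle becomes three vertices of packed int16
	 * (dst.x, dst.y, src.x, src.y) pairs, emitted in the order
	 * (x2,y2), (x1,y2), (x1,y1); the hardware infers the fourth
	 * corner of the RECTLIST primitive.
	 */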
3010	do {
3011		int16_t *v;
3012		int n_this_time;
3013
3014		n_this_time = gen8_get_rectangles(sna, &tmp, n,
3015						  gen8_emit_copy_state);
3016		n -= n_this_time;
3017
3018		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
3019		sna->render.vertex_used += 6 * n_this_time;
3020		assert(sna->render.vertex_used <= sna->render.vertex_size);
3021		do {
3023			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
3024			     box->x1 + src_dx, box->y1 + src_dy,
3025			     box->x1 + dst_dx, box->y1 + dst_dy,
3026			     box->x2 - box->x1, box->y2 - box->y1));
3027			v[0] = box->x2 + dst_dx;
3028			v[2] = box->x2 + src_dx;
3029			v[1]  = v[5] = box->y2 + dst_dy;
3030			v[3]  = v[7] = box->y2 + src_dy;
3031			v[8]  = v[4] = box->x1 + dst_dx;
3032			v[10] = v[6] = box->x1 + src_dx;
3033			v[9]  = box->y1 + dst_dy;
3034			v[11] = box->y1 + src_dy;
3035			v += 12; box++;
3036		} while (--n_this_time);
3037	} while (n);
3038
3039	gen8_vertex_flush(sna);
3040	sna_render_composite_redirect_done(sna, &tmp);
3041	if (tmp.src.bo != src_bo)
3042		kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3043	return true;
3044
3045fallback_tiled_dst:
3046	if (tmp.redirect.real_bo)
3047		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
3048fallback_tiled:
3049	DBG(("%s: fallback tiled\n", __FUNCTION__));
3050	if (sna_blt_compare_depth(src, dst) &&
3051	    sna_blt_copy_boxes(sna, alu,
3052			       src_bo, src_dx, src_dy,
3053			       dst_bo, dst_dx, dst_dy,
3054			       dst->bitsPerPixel,
3055			       box, n))
3056		return true;
3057
3058	return sna_tiling_copy_boxes(sna, alu,
3059				     src, src_bo, src_dx, src_dy,
3060				     dst, dst_bo, dst_dx, dst_dy,
3061				     box, n);
3062}
3063
3064static void
3065gen8_render_copy_blt(struct sna *sna,
3066		     const struct sna_copy_op *op,
3067		     int16_t sx, int16_t sy,
3068		     int16_t w,  int16_t h,
3069		     int16_t dx, int16_t dy)
3070{
3071	int16_t *v;
3072
3073	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_copy_state);
3074
3075	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3076	sna->render.vertex_used += 6;
3077	assert(sna->render.vertex_used <= sna->render.vertex_size);
3078
3079	v[0]  = dx+w; v[1]  = dy+h;
3080	v[2]  = sx+w; v[3]  = sy+h;
3081	v[4]  = dx;   v[5]  = dy+h;
3082	v[6]  = sx;   v[7]  = sy+h;
3083	v[8]  = dx;   v[9]  = dy;
3084	v[10] = sx;   v[11] = sy;
3085}
3086
3087static void
3088gen8_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
3089{
3090	if (sna->render.vertex_offset)
3091		gen8_vertex_flush(sna);
3092}
3093
3094static bool
3095gen8_render_copy(struct sna *sna, uint8_t alu,
3096		 PixmapPtr src, struct kgem_bo *src_bo,
3097		 PixmapPtr dst, struct kgem_bo *dst_bo,
3098		 struct sna_copy_op *op)
3099{
3100	DBG(("%s (alu=%d, src=(%dx%d), dst=(%dx%d))\n",
3101	     __FUNCTION__, alu,
3102	     src->drawable.width, src->drawable.height,
3103	     dst->drawable.width, dst->drawable.height));
3104
3105	if (prefer_blt_copy(sna, src_bo, dst_bo, 0) &&
3106	    sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
3107	    sna_blt_copy(sna, alu,
3108			 src_bo, dst_bo,
3109			 dst->drawable.bitsPerPixel,
3110			 op))
3111		return true;
3112
3113	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
3114	    too_large(src->drawable.width, src->drawable.height) ||
3115	    too_large(dst->drawable.width, dst->drawable.height) ||
3116	    unaligned(src_bo, src->drawable.bitsPerPixel) ||
3117	    unaligned(dst_bo, dst->drawable.bitsPerPixel)) {
3118fallback:
3119		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
3120			return false;
3121
3122		return sna_blt_copy(sna, alu, src_bo, dst_bo,
3123				    dst->drawable.bitsPerPixel,
3124				    op);
3125	}
3126
3127	if (dst->drawable.depth == src->drawable.depth) {
3128		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
3129		op->base.src.pict_format = op->base.dst.format;
3130	} else {
3131		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
3132		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
3133	}
3134	if (!gen8_check_format(op->base.src.pict_format))
3135		goto fallback;
3136
3137	op->base.dst.pixmap = dst;
3138	op->base.dst.width  = dst->drawable.width;
3139	op->base.dst.height = dst->drawable.height;
3140	op->base.dst.bo = dst_bo;
3141
3142	op->base.src.bo = src_bo;
3143	op->base.src.card_format =
3144		gen8_get_card_format(op->base.src.pict_format);
3145	op->base.src.width  = src->drawable.width;
3146	op->base.src.height = src->drawable.height;
3147
3148	op->base.mask.bo = NULL;
3149
3150	op->base.floats_per_vertex = 2;
3151	op->base.floats_per_rect = 6;
3152
3153	op->base.u.gen8.flags = COPY_FLAGS(alu);
3154
3155	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
3156	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
3157		kgem_submit(&sna->kgem);
3158		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
3159			goto fallback;
3160		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3161	}
3162
3163	gen8_align_vertex(sna, &op->base);
3164	gen8_emit_copy_state(sna, &op->base);
3165
3166	op->blt  = gen8_render_copy_blt;
3167	op->done = gen8_render_copy_done;
3168	return true;
3169}
3170
3171static void
3172gen8_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
3173{
3174	uint32_t *binding_table;
3175	uint16_t offset, dirty;
3176
3177	/* XXX Render Target Fast Clear
3178	 * Set RTFC Enable in PS and render a rectangle.
3179	 * Limited to clearing the full MCS surface, and only with a
3180	 * specific kernel.
3181	 */
3182
3183	gen8_get_batch(sna, op);
3184
3185	binding_table = gen8_composite_get_binding_table(sna, &offset);
3186
3187	dirty = kgem_bo_is_dirty(op->dst.bo);
3188
3189	binding_table[0] =
3190		gen8_bind_bo(sna,
3191			     op->dst.bo, op->dst.width, op->dst.height,
3192			     gen8_get_dest_format(op->dst.format),
3193			     true);
3194	binding_table[1] =
3195		gen8_bind_bo(sna,
3196			     op->src.bo, 1, 1,
3197			     SURFACEFORMAT_B8G8R8A8_UNORM,
3198			     false);
3199
3200	if (sna->kgem.surface == offset &&
3201	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen8.surface_table) == *(uint64_t*)binding_table) {
3202		sna->kgem.surface += SURFACE_DW;
3203		offset = sna->render_state.gen8.surface_table;
3204	}
3205
3206	if (sna->kgem.batch[sna->render_state.gen8.surface_table] == binding_table[0])
3207		dirty = 0;
3208
3209	gen8_emit_state(sna, op, offset | dirty);
3210}
3211
3212static bool
3213gen8_render_fill_boxes(struct sna *sna,
3214		       CARD8 op,
3215		       PictFormat format,
3216		       const xRenderColor *color,
3217		       const DrawableRec *dst, struct kgem_bo *dst_bo,
3218		       const BoxRec *box, int n)
3219{
3220	struct sna_composite_op tmp;
3221	uint32_t pixel;
3222
3223	DBG(("%s (op=%d, color=(%04x, %04x, %04x, %04x) [%08x])\n",
3224	     __FUNCTION__, op,
3225	     color->red, color->green, color->blue, color->alpha, (int)format));
3226
3227	if (op >= ARRAY_SIZE(gen8_blend_op)) {
3228		DBG(("%s: fallback due to unhandled blend op: %d\n",
3229		     __FUNCTION__, op));
3230		return false;
3231	}
3232
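	/* Try to reduce the fill to a solid BLT first: PictOpClear and any
	 * PictOpSrc colour expressible in the destination format map onto
	 * GXclear/GXcopy, while GXinvalid marks the ops the blitter
	 * cannot perform.
	 */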
3233	if (prefer_blt_fill(sna, dst_bo, FILL_BOXES) ||
3234	    !gen8_check_dst_format(format) ||
3235	    unaligned(dst_bo, PICT_FORMAT_BPP(format))) {
3236		uint8_t alu = GXinvalid;
3237
3238		if (op <= PictOpSrc) {
3239			pixel = 0;
3240			if (op == PictOpClear)
3241				alu = GXclear;
3242			else if (sna_get_pixel_from_rgba(&pixel,
3243							 color->red,
3244							 color->green,
3245							 color->blue,
3246							 color->alpha,
3247							 format))
3248				alu = GXcopy;
3249		}
3250
3251		if (alu != GXinvalid &&
3252		    sna_blt_fill_boxes(sna, alu,
3253				       dst_bo, dst->bitsPerPixel,
3254				       pixel, box, n))
3255			return true;
3256
3257		if (!gen8_check_dst_format(format))
3258			return false;
3259	}
3260
3261	if (op == PictOpClear) {
3262		pixel = 0;
3263		op = PictOpSrc;
3264	} else if (!sna_get_pixel_from_rgba(&pixel,
3265					    color->red,
3266					    color->green,
3267					    color->blue,
3268					    color->alpha,
3269					    PICT_a8r8g8b8))
3270		return false;
3271
3272	DBG(("%s(%08x x %d [(%d, %d), (%d, %d) ...])\n",
3273	     __FUNCTION__, pixel, n,
3274	     box[0].x1, box[0].y1, box[0].x2, box[0].y2));
3275
3276	tmp.dst.pixmap = (PixmapPtr)dst;
3277	tmp.dst.width  = dst->width;
3278	tmp.dst.height = dst->height;
3279	tmp.dst.format = format;
3280	tmp.dst.bo = dst_bo;
3281	tmp.dst.x = tmp.dst.y = 0;
3282	tmp.damage = NULL;
3283
3284	sna_render_composite_redirect_init(&tmp);
3285	if (too_large(dst->width, dst->height)) {
3286		BoxRec extents;
3287
3288		boxes_extents(box, n, &extents);
3289		if (!sna_render_composite_redirect(sna, &tmp,
3290						   extents.x1, extents.y1,
3291						   extents.x2 - extents.x1,
3292						   extents.y2 - extents.y1,
3293						   n > 1))
3294			return sna_tiling_fill_boxes(sna, op, format, color,
3295						     dst, dst_bo, box, n);
3296	}
3297
3298	tmp.src.bo = sna_render_get_solid(sna, pixel);
3299	tmp.mask.bo = NULL;
3300
3301	tmp.floats_per_vertex = 2;
3302	tmp.floats_per_rect = 6;
3303	tmp.need_magic_ca_pass = false;
3304
3305	tmp.u.gen8.flags = FILL_FLAGS(op, format);
3306
3307	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
3308	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3309		kgem_submit(&sna->kgem);
3310		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3311			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3312			tmp.src.bo = NULL;
3313
3314			if (tmp.redirect.real_bo) {
3315				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
3316				tmp.redirect.real_bo = NULL;
3317			}
3318
3319			return false;
3320		}
3321		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3322	}
3323
3324	gen8_align_vertex(sna, &tmp);
3325	gen8_emit_fill_state(sna, &tmp);
3326
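	/* Fill rectangles sample a 1x1 solid-colour surface: the second
	 * int16 pair of each vertex is its texture coordinate, pinned to
	 * (1,1), (0,1), (0,0) so that every fragment fetches the same
	 * texel.
	 */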
3327	do {
3328		int n_this_time;
3329		int16_t *v;
3330
3331		n_this_time = gen8_get_rectangles(sna, &tmp, n,
3332						  gen8_emit_fill_state);
3333		n -= n_this_time;
3334
3335		v = (int16_t *)(sna->render.vertices + sna->render.vertex_used);
3336		sna->render.vertex_used += 6 * n_this_time;
3337		assert(sna->render.vertex_used <= sna->render.vertex_size);
3338		do {
3339			DBG(("	(%d, %d), (%d, %d)\n",
3340			     box->x1, box->y1, box->x2, box->y2));
3341
3342			v[0] = box->x2;
3343			v[5] = v[1] = box->y2;
3344			v[8] = v[4] = box->x1;
3345			v[9] = box->y1;
3346			v[2] = v[3]  = v[7]  = 1;
3347			v[6] = v[10] = v[11] = 0;
3348			v += 12; box++;
3349		} while (--n_this_time);
3350	} while (n);
3351
3352	gen8_vertex_flush(sna);
3353	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3354	sna_render_composite_redirect_done(sna, &tmp);
3355	return true;
3356}
3357
3358static void
3359gen8_render_fill_op_blt(struct sna *sna,
3360			const struct sna_fill_op *op,
3361			int16_t x, int16_t y, int16_t w, int16_t h)
3362{
3363	int16_t *v;
3364
3365	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));
3366
3367	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_fill_state);
3368
3369	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3370	sna->render.vertex_used += 6;
3371	assert(sna->render.vertex_used <= sna->render.vertex_size);
3372
3373	v[0] = x+w;
3374	v[4] = v[8] = x;
3375	v[1] = v[5] = y+h;
3376	v[9] = y;
3377
3378	v[2] = v[3]  = v[7]  = 1;
3379	v[6] = v[10] = v[11] = 0;
3380}
3381
3382fastcall static void
3383gen8_render_fill_op_box(struct sna *sna,
3384			const struct sna_fill_op *op,
3385			const BoxRec *box)
3386{
3387	int16_t *v;
3388
3389	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
3390	     box->x1, box->y1, box->x2, box->y2));
3391
3392	gen8_get_rectangles(sna, &op->base, 1, gen8_emit_fill_state);
3393
3394	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3395	sna->render.vertex_used += 6;
3396	assert(sna->render.vertex_used <= sna->render.vertex_size);
3397
3398	v[0] = box->x2;
3399	v[8] = v[4] = box->x1;
3400	v[5] = v[1] = box->y2;
3401	v[9] = box->y1;
3402
3403	v[7] = v[2]  = v[3]  = 1;
3404	v[6] = v[10] = v[11] = 0;
3405}
3406
3407fastcall static void
3408gen8_render_fill_op_boxes(struct sna *sna,
3409			  const struct sna_fill_op *op,
3410			  const BoxRec *box,
3411			  int nbox)
3412{
3413	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
3414	     box->x1, box->y1, box->x2, box->y2, nbox));
3415
3416	do {
3417		int nbox_this_time;
3418		int16_t *v;
3419
3420		nbox_this_time = gen8_get_rectangles(sna, &op->base, nbox,
3421						     gen8_emit_fill_state);
3422		nbox -= nbox_this_time;
3423
3424		v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3425		sna->render.vertex_used += 6 * nbox_this_time;
3426		assert(sna->render.vertex_used <= sna->render.vertex_size);
3427
3428		do {
3429			v[0] = box->x2;
3430			v[8] = v[4] = box->x1;
3431			v[5] = v[1] = box->y2;
3432			v[9] = box->y1;
3433			v[7] = v[2]  = v[3]  = 1;
3434			v[6] = v[10] = v[11] = 0;
3435			box++; v += 12;
3436		} while (--nbox_this_time);
3437	} while (nbox);
3438}
3439
3440static void
3441gen8_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
3442{
3443	if (sna->render.vertex_offset)
3444		gen8_vertex_flush(sna);
3445	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
3446}
3447
3448static bool
3449gen8_render_fill(struct sna *sna, uint8_t alu,
3450		 PixmapPtr dst, struct kgem_bo *dst_bo,
3451		 uint32_t color, unsigned flags,
3452		 struct sna_fill_op *op)
3453{
3454	DBG(("%s: (alu=%d, color=%x)\n", __FUNCTION__, alu, color));
3455
3456	if (prefer_blt_fill(sna, dst_bo, flags) &&
3457	    sna_blt_fill(sna, alu,
3458			 dst_bo, dst->drawable.bitsPerPixel,
3459			 color,
3460			 op))
3461		return true;
3462
3463	if (!(alu == GXcopy || alu == GXclear) ||
3464	    too_large(dst->drawable.width, dst->drawable.height) ||
3465	    unaligned(dst_bo, dst->drawable.bitsPerPixel))
3466		return sna_blt_fill(sna, alu,
3467				    dst_bo, dst->drawable.bitsPerPixel,
3468				    color,
3469				    op);
3470
3471	if (alu == GXclear)
3472		color = 0;
3473
3474	op->base.dst.pixmap = dst;
3475	op->base.dst.width  = dst->drawable.width;
3476	op->base.dst.height = dst->drawable.height;
3477	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
3478	op->base.dst.bo = dst_bo;
3479	op->base.dst.x = op->base.dst.y = 0;
3480
3481	op->base.src.bo =
3482		sna_render_get_solid(sna,
3483				     sna_rgba_for_color(color,
3484							dst->drawable.depth));
3485	op->base.mask.bo = NULL;
3486
3487	op->base.need_magic_ca_pass = false;
3488	op->base.floats_per_vertex = 2;
3489	op->base.floats_per_rect = 6;
3490
3491	op->base.u.gen8.flags = FILL_FLAGS_NOBLEND;
3492
3493	kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
3494	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3495		kgem_submit(&sna->kgem);
3496		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
3497			kgem_bo_destroy(&sna->kgem, op->base.src.bo);
3498			return false;
3499		}
3500
3501		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3502	}
3503
3504	gen8_align_vertex(sna, &op->base);
3505	gen8_emit_fill_state(sna, &op->base);
3506
3507	op->blt   = gen8_render_fill_op_blt;
3508	op->box   = gen8_render_fill_op_box;
3509	op->boxes = gen8_render_fill_op_boxes;
3510	op->points = NULL;
3511	op->done  = gen8_render_fill_op_done;
3512	return true;
3513}
3514
3515static bool
3516gen8_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3517			     uint32_t color,
3518			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
3519			     uint8_t alu)
3520{
3521	BoxRec box;
3522
3523	box.x1 = x1;
3524	box.y1 = y1;
3525	box.x2 = x2;
3526	box.y2 = y2;
3527
3528	return sna_blt_fill_boxes(sna, alu,
3529				  bo, dst->drawable.bitsPerPixel,
3530				  color, &box, 1);
3531}
3532
3533static bool
3534gen8_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3535		     uint32_t color,
3536		     int16_t x1, int16_t y1,
3537		     int16_t x2, int16_t y2,
3538		     uint8_t alu)
3539{
3540	struct sna_composite_op tmp;
3541	int16_t *v;
3542
3543	/* Prefer to use the BLT if already engaged */
3544	if (prefer_blt_fill(sna, bo, FILL_BOXES) &&
3545	    gen8_render_fill_one_try_blt(sna, dst, bo, color,
3546					 x1, y1, x2, y2, alu))
3547		return true;
3548
3549	/* Must use the BLT if we can't RENDER... */
3550	if (!(alu == GXcopy || alu == GXclear) ||
3551	    too_large(dst->drawable.width, dst->drawable.height) ||
3552	    unaligned(bo, dst->drawable.bitsPerPixel))
3553		return gen8_render_fill_one_try_blt(sna, dst, bo, color,
3554						    x1, y1, x2, y2, alu);
3555
3556	if (alu == GXclear)
3557		color = 0;
3558
3559	tmp.dst.pixmap = dst;
3560	tmp.dst.width  = dst->drawable.width;
3561	tmp.dst.height = dst->drawable.height;
3562	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
3563	tmp.dst.bo = bo;
3564	tmp.dst.x = tmp.dst.y = 0;
3565
3566	tmp.src.bo =
3567		sna_render_get_solid(sna,
3568				     sna_rgba_for_color(color,
3569							dst->drawable.depth));
3570	tmp.mask.bo = NULL;
3571
3572	tmp.floats_per_vertex = 2;
3573	tmp.floats_per_rect = 6;
3574	tmp.need_magic_ca_pass = false;
3575
3576	tmp.u.gen8.flags = FILL_FLAGS_NOBLEND;
3577
3578	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
3579	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3580		kgem_submit(&sna->kgem);
3581		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3582			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3583			return false;
3584		}
3585		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3586	}
3587
3588	gen8_align_vertex(sna, &tmp);
3589	gen8_emit_fill_state(sna, &tmp);
3590
3591	gen8_get_rectangles(sna, &tmp, 1, gen8_emit_fill_state);
3592
3593	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
3594
3595	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3596	sna->render.vertex_used += 6;
3597	assert(sna->render.vertex_used <= sna->render.vertex_size);
3598
3599	v[0] = x2;
3600	v[8] = v[4] = x1;
3601	v[5] = v[1] = y2;
3602	v[9] = y1;
3603	v[7] = v[2]  = v[3]  = 1;
3604	v[6] = v[10] = v[11] = 0;
3605
3606	gen8_vertex_flush(sna);
3607	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3608
3609	return true;
3610}
3611
3612static bool
3613gen8_render_clear_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
3614{
3615	BoxRec box;
3616
3617	box.x1 = 0;
3618	box.y1 = 0;
3619	box.x2 = dst->drawable.width;
3620	box.y2 = dst->drawable.height;
3621
3622	return sna_blt_fill_boxes(sna, GXclear,
3623				  bo, dst->drawable.bitsPerPixel,
3624				  0, &box, 1);
3625}
3626
3627static bool
3628gen8_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
3629{
3630	struct sna_composite_op tmp;
3631	int16_t *v;
3632
3633	DBG(("%s: %dx%d\n",
3634	     __FUNCTION__,
3635	     dst->drawable.width,
3636	     dst->drawable.height));
3637
3638	/* Prefer to use the BLT if already engaged */
3639	if (sna->kgem.mode == KGEM_BLT &&
3640	    gen8_render_clear_try_blt(sna, dst, bo))
3641		return true;
3642
3643	/* Must use the BLT if we can't RENDER... */
3644	if (too_large(dst->drawable.width, dst->drawable.height) ||
3645	    unaligned(bo, dst->drawable.bitsPerPixel))
3646		return gen8_render_clear_try_blt(sna, dst, bo);
3647
3648	tmp.dst.pixmap = dst;
3649	tmp.dst.width  = dst->drawable.width;
3650	tmp.dst.height = dst->drawable.height;
3651	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
3652	tmp.dst.bo = bo;
3653	tmp.dst.x = tmp.dst.y = 0;
3654
3655	tmp.src.bo = sna_render_get_solid(sna, 0);
3656	tmp.mask.bo = NULL;
3657
3658	tmp.floats_per_vertex = 2;
3659	tmp.floats_per_rect = 6;
3660	tmp.need_magic_ca_pass = false;
3661
3662	tmp.u.gen8.flags = FILL_FLAGS_NOBLEND;
3663
3664	kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
3665	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3666		kgem_submit(&sna->kgem);
3667		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3668			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3669			return false;
3670		}
3671		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3672	}
3673
3674	gen8_align_vertex(sna, &tmp);
3675	gen8_emit_fill_state(sna, &tmp);
3676
3677	gen8_get_rectangles(sna, &tmp, 1, gen8_emit_fill_state);
3678
3679	v = (int16_t *)&sna->render.vertices[sna->render.vertex_used];
3680	sna->render.vertex_used += 6;
3681	assert(sna->render.vertex_used <= sna->render.vertex_size);
3682
3683	v[0] = dst->drawable.width;
3684	v[5] = v[1] = dst->drawable.height;
3685	v[8] = v[4] = 0;
3686	v[9] = 0;
3687
3688	v[7] = v[2]  = v[3]  = 1;
3689	v[6] = v[10] = v[11] = 0;
3690
3691	gen8_vertex_flush(sna);
3692	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3693
3694	return true;
3695}
3696
3697#if !NO_VIDEO
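/* Hand-rolled SURFACE_STATE for one plane of a video frame: unlike the
 * generic gen8_bind_bo(), this takes an explicit byte offset (delta) and
 * pitch, as the Y/U/V planes all live within a single bo.
 */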
3698static uint32_t gen8_bind_video_source(struct sna *sna,
3699				       struct kgem_bo *bo,
3700				       uint32_t delta,
3701				       int width,
3702				       int height,
3703				       int pitch,
3704				       uint32_t format)
3705{
3706	uint32_t *ss;
3707	int offset;
3708
3709	offset = sna->kgem.surface -= SURFACE_DW;
3710	ss = sna->kgem.batch + offset;
3711	ss[0] = (SURFACE_2D << SURFACE_TYPE_SHIFT |
3712		 gen8_tiling_bits(bo->tiling) |
3713		 format << SURFACE_FORMAT_SHIFT |
3714		 SURFACE_VALIGN_4 | SURFACE_HALIGN_4);
3715	ss[1] = 0;
3716	ss[2] = ((width - 1)  << SURFACE_WIDTH_SHIFT |
3717		 (height - 1) << SURFACE_HEIGHT_SHIFT);
3718	ss[3] = (pitch - 1) << SURFACE_PITCH_SHIFT;
3719	ss[4] = 0;
3720	ss[5] = 0;
3721	ss[6] = 0;
3722	ss[7] = SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
3723	*(uint64_t *)(ss+8) =
3724		kgem_add_reloc64(&sna->kgem, offset + 8, bo,
3725				 I915_GEM_DOMAIN_SAMPLER << 16,
3726				 delta);
3727	ss[10] = 0;
3728	ss[11] = 0;
3729	ss[12] = 0;
3730	ss[13] = 0;
3731	ss[14] = 0;
3732	ss[15] = 0;
3733
3734	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> sampler\n",
3735	     offset, bo->handle, ss[1],
3736	     format, width, height, bo->pitch, bo->tiling));
3737
3738	return offset * sizeof(uint32_t);
3739}
3740
3741static void gen8_emit_video_state(struct sna *sna,
3742				  const struct sna_composite_op *op)
3743{
3744	struct sna_video_frame *frame = op->priv;
3745	uint32_t src_surf_format[6];
3746	uint32_t src_surf_base[6];
3747	int src_width[6];
3748	int src_height[6];
3749	int src_pitch[6];
3750	uint32_t *binding_table;
3751	uint16_t offset;
3752	int n_src, n;
3753
3754	/* XXX VeBox, bicubic */
3755
3756	gen8_get_batch(sna, op);
3757
3758	src_surf_base[0] = 0;
3759	src_surf_base[1] = 0;
3760	src_surf_base[2] = frame->VBufOffset;
3761	src_surf_base[3] = frame->VBufOffset;
3762	src_surf_base[4] = frame->UBufOffset;
3763	src_surf_base[5] = frame->UBufOffset;
3764
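	/* Planar video is sampled through six surfaces: two views of the
	 * full-resolution Y plane, then two V and two U views at half
	 * resolution (for NV12 the chroma views are interleaved R8G8);
	 * packed formats need just a single surface.
	 */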
3765	if (is_planar_fourcc(frame->id)) {
3766		for (n = 0; n < 2; n++) {
3767			src_surf_format[n] = SURFACEFORMAT_R8_UNORM;
3768			src_width[n] = frame->width;
3769			src_height[n] = frame->height;
3770			src_pitch[n] = frame->pitch[1];
3771		}
3772		for (; n < 6; n++) {
3773			if (is_nv12_fourcc(frame->id))
3774				src_surf_format[n] = SURFACEFORMAT_R8G8_UNORM;
3775			else
3776				src_surf_format[n] = SURFACEFORMAT_R8_UNORM;
3777			src_width[n] = frame->width / 2;
3778			src_height[n] = frame->height / 2;
3779			src_pitch[n] = frame->pitch[0];
3780		}
3781		n_src = 6;
3782	} else {
3783		if (frame->id == FOURCC_RGB888)
3784			src_surf_format[0] = SURFACEFORMAT_B8G8R8X8_UNORM;
3785		else if (frame->id == FOURCC_UYVY)
3786			src_surf_format[0] = SURFACEFORMAT_YCRCB_SWAPY;
3787		else
3788			src_surf_format[0] = SURFACEFORMAT_YCRCB_NORMAL;
3789
3790		src_width[0]  = frame->width;
3791		src_height[0] = frame->height;
3792		src_pitch[0]  = frame->pitch[0];
3793		n_src = 1;
3794	}
3795
3796	binding_table = gen8_composite_get_binding_table(sna, &offset);
3797
3798	binding_table[0] =
3799		gen8_bind_bo(sna,
3800			     op->dst.bo, op->dst.width, op->dst.height,
3801			     gen8_get_dest_format(op->dst.format),
3802			     true);
3803	for (n = 0; n < n_src; n++) {
3804		binding_table[1+n] =
3805			gen8_bind_video_source(sna,
3806					       frame->bo,
3807					       src_surf_base[n],
3808					       src_width[n],
3809					       src_height[n],
3810					       src_pitch[n],
3811					       src_surf_format[n]);
3812	}
3813
3814	gen8_emit_state(sna, op, offset);
3815}
3816
3817static unsigned select_video_kernel(const struct sna_video *video,
3818				    const struct sna_video_frame *frame)
3819{
3820	switch (frame->id) {
3821	case FOURCC_YV12:
3822	case FOURCC_I420:
3823	case FOURCC_XVMC:
3824		return video->colorspace ?
3825			GEN8_WM_KERNEL_VIDEO_PLANAR_BT709 :
3826			GEN8_WM_KERNEL_VIDEO_PLANAR_BT601;
3827
3828	case FOURCC_NV12:
3829		return video->colorspace ?
3830			GEN8_WM_KERNEL_VIDEO_NV12_BT709 :
3831			GEN8_WM_KERNEL_VIDEO_NV12_BT601;
3832
3833	case FOURCC_RGB888:
3834	case FOURCC_RGB565:
3835		return GEN8_WM_KERNEL_VIDEO_RGB;
3836
3837	default:
3838		return video->colorspace ?
3839			GEN8_WM_KERNEL_VIDEO_PACKED_BT709 :
3840			GEN8_WM_KERNEL_VIDEO_PACKED_BT601;
3841	}
3842}
3843
3844static bool
3845gen8_render_video(struct sna *sna,
3846		  struct sna_video *video,
3847		  struct sna_video_frame *frame,
3848		  RegionPtr dstRegion,
3849		  PixmapPtr pixmap)
3850{
3851	struct sna_composite_op tmp;
3852	struct sna_pixmap *priv = sna_pixmap(pixmap);
3853	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
3854	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
3855	int src_width = frame->src.x2 - frame->src.x1;
3856	int src_height = frame->src.y2 - frame->src.y1;
3857	float src_offset_x, src_offset_y;
3858	float src_scale_x, src_scale_y;
3859	unsigned filter;
3860	const BoxRec *box;
3861	int nbox;
3862
3863	DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
3864	     __FUNCTION__,
3865	     src_width, src_height, dst_width, dst_height,
3866	     region_num_rects(dstRegion),
3867	     REGION_EXTENTS(NULL, dstRegion)->x1,
3868	     REGION_EXTENTS(NULL, dstRegion)->y1,
3869	     REGION_EXTENTS(NULL, dstRegion)->x2,
3870	     REGION_EXTENTS(NULL, dstRegion)->y2));
3871
3872	assert(priv->gpu_bo);
3873	assert(!too_large(pixmap->drawable.width, pixmap->drawable.height));
3874	assert(!unaligned(priv->gpu_bo, pixmap->drawable.bitsPerPixel));
3875
3876	memset(&tmp, 0, sizeof(tmp));
3877
3878	tmp.dst.pixmap = pixmap;
3879	tmp.dst.width  = pixmap->drawable.width;
3880	tmp.dst.height = pixmap->drawable.height;
3881	tmp.dst.format = sna_render_format_for_depth(pixmap->drawable.depth);
3882	tmp.dst.bo = priv->gpu_bo;
3883
3884	tmp.src.bo = frame->bo;
3885	tmp.mask.bo = NULL;
3886
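	/* Each vertex is one packed (x, y) dword plus a 2D texture
	 * coordinate: 3 floats per vertex, 9 per (3-vertex) rectangle.
	 */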
3887	tmp.floats_per_vertex = 3;
3888	tmp.floats_per_rect = 9;
3889
3890	DBG(("%s: scaling?=%d, planar?=%d [%x]\n",
3891	     __FUNCTION__,
3892	     src_width != dst_width || src_height != dst_height,
3893	     is_planar_fourcc(frame->id), frame->id));
3894
3895	if (src_width == dst_width && src_height == dst_height)
3896		filter = SAMPLER_FILTER_NEAREST;
3897	else
3898		filter = SAMPLER_FILTER_BILINEAR;
3899
3900	tmp.u.gen8.flags =
3901		GEN8_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
3902					      SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
3903			       NO_BLEND,
3904			       select_video_kernel(video, frame),
3905			       2);
3906	tmp.priv = frame;
3907
3908	kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
3909	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
3910		kgem_submit(&sna->kgem);
3911		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
3912			return false;
3913
3914		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
3915	}
3916
3917	gen8_align_vertex(sna, &tmp);
3918	gen8_emit_video_state(sna, &tmp);
3919
3920	DBG(("%s: src=(%d, %d)x(%d, %d); frame=(%dx%d), dst=(%dx%d)\n",
3921	     __FUNCTION__,
3922	     frame->src.x1, frame->src.y1,
3923	     src_width, src_height,
3924	     dst_width, dst_height,
3925	     frame->width, frame->height));
3926
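	/* Map destination pixels to normalised [0, 1] texture coordinates:
	 * scale by the src:dst ratio over the frame size, then offset so
	 * that the region origin samples from frame->src.x1/y1.
	 */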
3927	src_scale_x = (float)src_width / dst_width / frame->width;
3928	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
3929
3930	src_scale_y = (float)src_height / dst_height / frame->height;
3931	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;
3932
3933	DBG(("%s: scale=(%f, %f), offset=(%f, %f)\n",
3934	     __FUNCTION__,
3935	     src_scale_x, src_scale_y,
3936	     src_offset_x, src_offset_y));
3937
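	/* Emit a RECTLIST: three corners per clip rectangle, with the
	 * hardware inferring the fourth.
	 */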
3938	box = region_rects(dstRegion);
3939	nbox = region_num_rects(dstRegion);
3940	while (nbox--) {
3941		DBG(("%s: dst=(%d, %d), (%d, %d) + (%d, %d); src=(%f, %f), (%f, %f)\n",
3942		     __FUNCTION__,
3943		     box->x1, box->y1,
3944		     box->x2, box->y2,
3945		     box->x1 * src_scale_x + src_offset_x,
3946		     box->y1 * src_scale_y + src_offset_y,
3947		     box->x2 * src_scale_x + src_offset_x,
3948		     box->y2 * src_scale_y + src_offset_y));
3949
3950		gen8_get_rectangles(sna, &tmp, 1, gen8_emit_video_state);
3951
3952		OUT_VERTEX(box->x2, box->y2);
3953		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
3954		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
3955
3956		OUT_VERTEX(box->x1, box->y2);
3957		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
3958		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
3959
3960		OUT_VERTEX(box->x1, box->y1);
3961		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
3962		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
3963
3964		box++;
3965	}
3966	gen8_vertex_flush(sna);
3967
3968	if (!DAMAGE_IS_ALL(priv->gpu_damage))
3969		sna_damage_add(&priv->gpu_damage, dstRegion);
3970
3971	return true;
3972}
3973#endif
3974
3975static void gen8_render_flush(struct sna *sna)
3976{
3977	gen8_vertex_close(sna);
3978
3979	assert(sna->render.vb_id == 0);
3980	assert(sna->render.vertex_offset == 0);
3981}
3982
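/* Invalidate all cached hardware state at the start of a new batch so
 * that it is re-emitted on first use.
 */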
3983static void gen8_render_reset(struct sna *sna)
3984{
3985	sna->render_state.gen8.emit_flush = false;
3986	sna->render_state.gen8.needs_invariant = true;
3987	sna->render_state.gen8.ve_id = 3 << 2;
3988	sna->render_state.gen8.last_primitive = -1;
3989
3990	sna->render_state.gen8.num_sf_outputs = 0;
3991	sna->render_state.gen8.samplers = -1;
3992	sna->render_state.gen8.blend = -1;
3993	sna->render_state.gen8.kernel = -1;
3994	sna->render_state.gen8.drawrect_offset = -1;
3995	sna->render_state.gen8.drawrect_limit = -1;
3996	sna->render_state.gen8.surface_table = 0;
3997
3998	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
3999		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
4000		discard_vbo(sna);
4001	}
4002
4003	sna->render.vertex_offset = 0;
4004	sna->render.nvertex_reloc = 0;
4005	sna->render.vb_id = 0;
4006}
4007
4008static void gen8_render_fini(struct sna *sna)
4009{
4010	kgem_bo_destroy(&sna->kgem, sna->render_state.gen8.general_bo);
4011}
4012
4013static bool gen8_render_setup(struct sna *sna)
4014{
4015	struct gen8_render_state *state = &sna->render_state.gen8;
4016	struct sna_static_stream general;
4017	struct gen8_sampler_state *ss;
4018	int i, j, k, l, m;
4019	uint32_t devid;
4020
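	/* Crudely infer the GT level from the low bits of the PCI
	 * device id.
	 */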
4021	devid = intel_get_device_id(sna->dev);
4022	if (devid & 0xf)
4023		state->gt = ((devid >> 4) & 0xf) + 1;
4024	DBG(("%s: gt=%d\n", __FUNCTION__, state->gt));
4025
4026	if (is_bdw(sna))
4027		state->info = &bdw_gt_info;
4028	else if (is_chv(sna))
4029		state->info = &chv_gt_info;
4030	else
4031		return false;
4032
4033	sna_static_stream_init(&general);
4034
4035	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
4036	 * dumps, you know it points to zero.
4037	 */
4038	null_create(&general);
4039
4040	for (m = 0; m < ARRAY_SIZE(wm_kernels); m++) {
4041		if (wm_kernels[m].size) {
4042			state->wm_kernel[m][1] =
4043				sna_static_stream_add(&general,
4044						      wm_kernels[m].data,
4045						      wm_kernels[m].size,
4046						      64);
4047		} else {
4048			if (USE_8_PIXEL_DISPATCH) {
4049				state->wm_kernel[m][0] =
4050					sna_static_stream_compile_wm(sna, &general,
4051								     wm_kernels[m].data, 8);
4052			}
4053
4054			if (USE_16_PIXEL_DISPATCH) {
4055				state->wm_kernel[m][1] =
4056					sna_static_stream_compile_wm(sna, &general,
4057								     wm_kernels[m].data, 16);
4058			}
4059
4060			if (USE_32_PIXEL_DISPATCH) {
4061				state->wm_kernel[m][2] =
4062					sna_static_stream_compile_wm(sna, &general,
4063								     wm_kernels[m].data, 32);
4064			}
4065		}
4066		assert(state->wm_kernel[m][0]|state->wm_kernel[m][1]|state->wm_kernel[m][2]);
4067	}
4068
4069	COMPILE_TIME_ASSERT(SAMPLER_OFFSET(FILTER_COUNT, EXTEND_COUNT, FILTER_COUNT, EXTEND_COUNT) <= 0x7ff);
4070	ss = sna_static_stream_map(&general,
4071				   2 * sizeof(*ss) *
4072				   (2 +
4073				    FILTER_COUNT * EXTEND_COUNT *
4074				    FILTER_COUNT * EXTEND_COUNT),
4075				   32);
4076	state->wm_state = sna_static_stream_offsetof(&general, ss);
4077	sampler_copy_init(ss); ss += 2;
4078	sampler_fill_init(ss); ss += 2;
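	/* Build every (filter, extend) pairing for the source and mask
	 * samplers; SAMPLER_OFFSET() indexes into this table after the
	 * two fixed copy and fill entries above.
	 */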
4079	for (i = 0; i < FILTER_COUNT; i++) {
4080		for (j = 0; j < EXTEND_COUNT; j++) {
4081			for (k = 0; k < FILTER_COUNT; k++) {
4082				for (l = 0; l < EXTEND_COUNT; l++) {
4083					sampler_state_init(ss++, i, j);
4084					sampler_state_init(ss++, k, l);
4085				}
4086			}
4087		}
4088	}
4089
4090	state->cc_blend = gen8_create_blend_state(&general);
4091
4092	state->general_bo = sna_static_stream_fini(sna, &general);
4093	return state->general_bo != NULL;
4094}
4095
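/* Hook the gen8 render paths into sna->render, returning the previous
 * backend name unchanged if setup fails.
 */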
4096const char *gen8_render_init(struct sna *sna, const char *backend)
4097{
4098	if (!gen8_render_setup(sna))
4099		return backend;
4100
4101	sna->kgem.context_switch = gen6_render_context_switch;
4102	sna->kgem.retire = gen6_render_retire;
4103	sna->kgem.expire = gen4_render_expire;
4104
4105#if !NO_COMPOSITE
4106	sna->render.composite = gen8_render_composite;
4107	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
4108#endif
4109#if !NO_COMPOSITE_SPANS
4110	sna->render.check_composite_spans = gen8_check_composite_spans;
4111	sna->render.composite_spans = gen8_render_composite_spans;
4112	sna->render.prefer_gpu |= PREFER_GPU_SPANS;
4113#endif
4114#if !NO_VIDEO
4115	sna->render.video = gen8_render_video;
4116#endif
4117
4118#if !NO_COPY_BOXES
4119	sna->render.copy_boxes = gen8_render_copy_boxes;
4120#endif
4121#if !NO_COPY
4122	sna->render.copy = gen8_render_copy;
4123#endif
4124
4125#if !NO_FILL_BOXES
4126	sna->render.fill_boxes = gen8_render_fill_boxes;
4127#endif
4128#if !NO_FILL
4129	sna->render.fill = gen8_render_fill;
4130#endif
4131#if !NO_FILL_ONE
4132	sna->render.fill_one = gen8_render_fill_one;
4133#endif
4134#if !NO_FILL_CLEAR
4135	sna->render.clear = gen8_render_clear;
4136#endif
4137
4138	sna->render.flush = gen8_render_flush;
4139	sna->render.reset = gen8_render_reset;
4140	sna->render.fini = gen8_render_fini;
4141
4142	sna->render.max_3d_size = GEN8_MAX_SIZE;
4143	sna->render.max_3d_pitch = 1 << 18;
4144	return sna->render_state.gen8.info->name;
4145}
4146