/*
 * Copyright © 2006,2008,2011 Intel Corporation
 * Copyright © 2007 Red Hat, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Wang Zhenyu <zhenyu.z.wang@sna.com>
 *    Eric Anholt <eric@anholt.net>
 *    Carl Worth <cworth@redhat.com>
 *    Keith Packard <keithp@keithp.com>
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "sna.h"
#include "sna_reg.h"
#include "sna_render.h"
#include "sna_render_inline.h"
#include "sna_video.h"

#include "brw/brw.h"
#include "gen4_common.h"
#include "gen4_render.h"
#include "gen4_source.h"
#include "gen4_vertex.h"

/* gen4 has a serious issue with its shaders: we need to flush
 * after every rectangle. So until that is resolved, prefer
 * the BLT engine.
 */
#define FORCE_SPANS 0
#define FORCE_NONRECTILINEAR_SPANS -1
#define FORCE_FLUSH 1 /* https://bugs.freedesktop.org/show_bug.cgi?id=55500 */

#define ALWAYS_FLUSH 1

#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0
#define NO_COPY 0
#define NO_COPY_BOXES 0
#define NO_FILL 0
#define NO_FILL_ONE 0
#define NO_FILL_BOXES 0
#define NO_VIDEO 0

#define MAX_FLUSH_VERTICES 1 /* was 6, https://bugs.freedesktop.org/show_bug.cgi?id=55500 */

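/* Encode a GRF register count for the fixed-function thread setup: the
 * hardware counts in blocks of 16 registers, biased by one (so 1-16
 * registers encode as 0, 17-32 as 1, and so on).
 */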
#define GEN4_GRF_BLOCKS(nreg)    ((nreg + 15) / 16 - 1)

/* Set up a default static partitioning of the URB, which is supposed to
 * allow anything we would want to do, at potentially lower performance.
 */
#define URB_CS_ENTRY_SIZE     1
#define URB_CS_ENTRIES	      0

#define URB_VS_ENTRY_SIZE     1
#define URB_VS_ENTRIES	      32

#define URB_GS_ENTRY_SIZE     0
#define URB_GS_ENTRIES	      0

#define URB_CL_ENTRY_SIZE   0
#define URB_CL_ENTRIES      0

#define URB_SF_ENTRY_SIZE     2
#define URB_SF_ENTRIES	      64

/*
 * this program computes dA/dx and dA/dy for the texture coordinates along
 * with the base texture coordinate. It was extracted from the Mesa driver
 */

#define SF_KERNEL_NUM_GRF 16
#define PS_KERNEL_NUM_GRF 32

#define GEN4_MAX_SF_THREADS 24
#define GEN4_MAX_WM_THREADS 32
#define G4X_MAX_WM_THREADS 50

static const uint32_t ps_kernel_packed_bt601_static[][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_argb.g4b"
#include "exa_wm_yuv_rgb_bt601.g4b"
#include "exa_wm_write.g4b"
};

static const uint32_t ps_kernel_planar_bt601_static[][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_planar.g4b"
#include "exa_wm_yuv_rgb_bt601.g4b"
#include "exa_wm_write.g4b"
};

static const uint32_t ps_kernel_nv12_bt601_static[][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_nv12.g4b"
#include "exa_wm_yuv_rgb_bt601.g4b"
#include "exa_wm_write.g4b"
};

static const uint32_t ps_kernel_packed_bt709_static[][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_argb.g4b"
#include "exa_wm_yuv_rgb_bt709.g4b"
#include "exa_wm_write.g4b"
};

static const uint32_t ps_kernel_planar_bt709_static[][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_planar.g4b"
#include "exa_wm_yuv_rgb_bt709.g4b"
#include "exa_wm_write.g4b"
};

static const uint32_t ps_kernel_nv12_bt709_static[][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_nv12.g4b"
#include "exa_wm_yuv_rgb_bt709.g4b"
#include "exa_wm_write.g4b"
};

#define NOKERNEL(kernel_enum, func, masked) \
    [kernel_enum] = {func, 0, masked}
#define KERNEL(kernel_enum, kernel, masked) \
    [kernel_enum] = {&kernel, sizeof(kernel), masked}
static const struct wm_kernel_info {
	const void *data;
	unsigned int size;
	bool has_mask;
} wm_kernels[] = {
	NOKERNEL(WM_KERNEL, brw_wm_kernel__affine, false),
	NOKERNEL(WM_KERNEL_P, brw_wm_kernel__projective, false),

	NOKERNEL(WM_KERNEL_MASK, brw_wm_kernel__affine_mask, true),
	NOKERNEL(WM_KERNEL_MASK_P, brw_wm_kernel__projective_mask, true),

	NOKERNEL(WM_KERNEL_MASKCA, brw_wm_kernel__affine_mask_ca, true),
	NOKERNEL(WM_KERNEL_MASKCA_P, brw_wm_kernel__projective_mask_ca, true),

	NOKERNEL(WM_KERNEL_MASKSA, brw_wm_kernel__affine_mask_sa, true),
	NOKERNEL(WM_KERNEL_MASKSA_P, brw_wm_kernel__projective_mask_sa, true),

	NOKERNEL(WM_KERNEL_OPACITY, brw_wm_kernel__affine_opacity, true),
	NOKERNEL(WM_KERNEL_OPACITY_P, brw_wm_kernel__projective_opacity, true),

	KERNEL(WM_KERNEL_VIDEO_PLANAR_BT601, ps_kernel_planar_bt601_static, false),
	KERNEL(WM_KERNEL_VIDEO_NV12_BT601, ps_kernel_nv12_bt601_static, false),
	KERNEL(WM_KERNEL_VIDEO_PACKED_BT601, ps_kernel_packed_bt601_static, false),

	KERNEL(WM_KERNEL_VIDEO_PLANAR_BT709, ps_kernel_planar_bt709_static, false),
	KERNEL(WM_KERNEL_VIDEO_NV12_BT709, ps_kernel_nv12_bt709_static, false),
	KERNEL(WM_KERNEL_VIDEO_PACKED_BT709, ps_kernel_packed_bt709_static, false),
};
#undef KERNEL

static const struct blendinfo {
	bool src_alpha;
	uint32_t src_blend;
	uint32_t dst_blend;
} gen4_blend_op[] = {
	/* Clear */	{0, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_ZERO},
	/* Src */	{0, GEN4_BLENDFACTOR_ONE, GEN4_BLENDFACTOR_ZERO},
	/* Dst */	{0, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_ONE},
	/* Over */	{1, GEN4_BLENDFACTOR_ONE, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
	/* OverReverse */ {0, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_ONE},
	/* In */	{0, GEN4_BLENDFACTOR_DST_ALPHA, GEN4_BLENDFACTOR_ZERO},
	/* InReverse */	{1, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_SRC_ALPHA},
	/* Out */	{0, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_ZERO},
	/* OutReverse */ {1, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
	/* Atop */	{1, GEN4_BLENDFACTOR_DST_ALPHA, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
	/* AtopReverse */ {1, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_SRC_ALPHA},
	/* Xor */	{1, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
	/* Add */	{0, GEN4_BLENDFACTOR_ONE, GEN4_BLENDFACTOR_ONE},
};

/**
 * Highest-valued BLENDFACTOR used in gen4_blend_op.
 *
 * This leaves out GEN4_BLENDFACTOR_INV_DST_COLOR,
 * GEN4_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
 * GEN4_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
 */
#define GEN4_BLENDFACTOR_COUNT (GEN4_BLENDFACTOR_INV_DST_ALPHA + 1)

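/* The pre-baked color-calculator states form a 2D table of 64-byte
 * entries indexed by the (src, dst) blend factor pair.
 */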
#define BLEND_OFFSET(s, d) \
	(((s) * GEN4_BLENDFACTOR_COUNT + (d)) * 64)

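/* Similarly, the sampler/WM states are laid out as a dense table keyed by
 * source filter/extend, mask filter/extend and the WM kernel, again with a
 * 64-byte stride.
 */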
#define SAMPLER_OFFSET(sf, se, mf, me, k) \
	((((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) * KERNEL_COUNT + (k)) * 64)

static void
gen4_emit_pipelined_pointers(struct sna *sna,
			     const struct sna_composite_op *op,
			     int blend, int kernel);

#define OUT_BATCH(v) batch_emit(sna, v)
#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
#define OUT_VERTEX_F(v) vertex_emit(sna, v)

#define GEN4_MAX_3D_SIZE 8192

static inline bool too_large(int width, int height)
{
	return width > GEN4_MAX_3D_SIZE || height > GEN4_MAX_3D_SIZE;
}

static int
gen4_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
{
	int base;

	if (has_mask) {
		if (is_ca) {
			if (gen4_blend_op[op].src_alpha)
				base = WM_KERNEL_MASKSA;
			else
				base = WM_KERNEL_MASKCA;
		} else
			base = WM_KERNEL_MASK;
	} else
		base = WM_KERNEL;

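	/* The projective variant of each kernel immediately follows the
	 * affine one in the enum, so it is selected by a simple +1.
	 */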
	return base + !is_affine;
}

static bool gen4_magic_ca_pass(struct sna *sna,
			       const struct sna_composite_op *op)
{
	struct gen4_render_state *state = &sna->render_state.gen4;

	if (!op->need_magic_ca_pass)
		return false;

	assert(sna->render.vertex_index > sna->render.vertex_start);

	DBG(("%s: CA fixup\n", __FUNCTION__));
	assert(op->mask.bo != NULL);
	assert(op->has_component_alpha);

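	/* Replay the rectangles of the first pass with an ADD blend to
	 * complete the two-pass component-alpha composite.
	 */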
	gen4_emit_pipelined_pointers(sna, op, PictOpAdd,
				     gen4_choose_composite_kernel(PictOpAdd,
								  true, true, op->is_affine));

	OUT_BATCH(GEN4_3DPRIMITIVE |
		  GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
		  (_3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT) |
		  (0 << 9) |
		  4);
	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
	OUT_BATCH(sna->render.vertex_start);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */

	state->last_primitive = sna->kgem.nbatch;
	return true;
}

static uint32_t gen4_get_blend(int op,
			       bool has_component_alpha,
			       uint32_t dst_format)
{
	uint32_t src, dst;

	src = gen4_blend_op[op].src_blend;
	dst = gen4_blend_op[op].dst_blend;

	/* If there's no dst alpha channel, adjust the blend op so that we'll treat
	 * it as always 1.
	 */
	if (PICT_FORMAT_A(dst_format) == 0) {
		if (src == GEN4_BLENDFACTOR_DST_ALPHA)
			src = GEN4_BLENDFACTOR_ONE;
		else if (src == GEN4_BLENDFACTOR_INV_DST_ALPHA)
			src = GEN4_BLENDFACTOR_ZERO;
	}

	/* If the source alpha is being used, then we should only be in a
	 * case where the source blend factor is 0, and the source blend
	 * value is the mask channels multiplied by the source picture's alpha.
	 */
	if (has_component_alpha && gen4_blend_op[op].src_alpha) {
		if (dst == GEN4_BLENDFACTOR_SRC_ALPHA)
			dst = GEN4_BLENDFACTOR_SRC_COLOR;
		else if (dst == GEN4_BLENDFACTOR_INV_SRC_ALPHA)
			dst = GEN4_BLENDFACTOR_INV_SRC_COLOR;
	}

	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
	     op, dst_format, PICT_FORMAT_A(dst_format),
	     src, dst, BLEND_OFFSET(src, dst)));
	return BLEND_OFFSET(src, dst);
}

static uint32_t gen4_get_card_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
		return GEN4_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_x8r8g8b8:
		return GEN4_SURFACEFORMAT_B8G8R8X8_UNORM;
	case PICT_a8b8g8r8:
		return GEN4_SURFACEFORMAT_R8G8B8A8_UNORM;
	case PICT_x8b8g8r8:
		return GEN4_SURFACEFORMAT_R8G8B8X8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
		return GEN4_SURFACEFORMAT_B10G10R10A2_UNORM;
	case PICT_x2r10g10b10:
		return GEN4_SURFACEFORMAT_B10G10R10X2_UNORM;
#endif
	case PICT_r8g8b8:
		return GEN4_SURFACEFORMAT_R8G8B8_UNORM;
	case PICT_r5g6b5:
		return GEN4_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_a1r5g5b5:
		return GEN4_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN4_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
		return GEN4_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

static uint32_t gen4_get_dest_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
		return GEN4_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
		return GEN4_SURFACEFORMAT_R8G8B8A8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
		return GEN4_SURFACEFORMAT_B10G10R10A2_UNORM;
#endif
	case PICT_r5g6b5:
		return GEN4_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_x1r5g5b5:
	case PICT_a1r5g5b5:
		return GEN4_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN4_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
		return GEN4_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

static bool gen4_check_dst_format(PictFormat format)
{
	if (gen4_get_dest_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

static bool gen4_check_format(uint32_t format)
{
	if (gen4_get_card_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

typedef struct gen4_surface_state_padded {
	struct gen4_surface_state state;
	char pad[32 - sizeof(struct gen4_surface_state)];
} gen4_surface_state_padded;

static void null_create(struct sna_static_stream *stream)
{
	/* A bunch of zeros useful for legacy border color and depth-stencil */
	sna_static_stream_map(stream, 64, 64);
}

static void
sampler_state_init(struct gen4_sampler_state *sampler_state,
		   sampler_filter_t filter,
		   sampler_extend_t extend)
{
	sampler_state->ss0.lod_preclamp = 1;	/* GL mode */

	/* We use the legacy mode to get the semantics specified by
	 * the Render extension. */
	sampler_state->ss0.border_color_mode = GEN4_BORDER_COLOR_MODE_LEGACY;

	switch (filter) {
	default:
	case SAMPLER_FILTER_NEAREST:
		sampler_state->ss0.min_filter = GEN4_MAPFILTER_NEAREST;
		sampler_state->ss0.mag_filter = GEN4_MAPFILTER_NEAREST;
		break;
	case SAMPLER_FILTER_BILINEAR:
		sampler_state->ss0.min_filter = GEN4_MAPFILTER_LINEAR;
		sampler_state->ss0.mag_filter = GEN4_MAPFILTER_LINEAR;
		break;
	}

	switch (extend) {
	default:
	case SAMPLER_EXTEND_NONE:
		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_CLAMP_BORDER;
		break;
	case SAMPLER_EXTEND_REPEAT:
		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_WRAP;
		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_WRAP;
		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_WRAP;
		break;
	case SAMPLER_EXTEND_PAD:
		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_CLAMP;
		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_CLAMP;
		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_CLAMP;
		break;
	case SAMPLER_EXTEND_REFLECT:
		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_MIRROR;
		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_MIRROR;
		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_MIRROR;
		break;
	}
}

static uint32_t gen4_filter(uint32_t filter)
{
	switch (filter) {
	default:
		assert(0);
	case PictFilterNearest:
		return SAMPLER_FILTER_NEAREST;
	case PictFilterBilinear:
		return SAMPLER_FILTER_BILINEAR;
	}
}

static uint32_t gen4_check_filter(PicturePtr picture)
{
	switch (picture->filter) {
	case PictFilterNearest:
	case PictFilterBilinear:
		return true;
	default:
		DBG(("%s: unknown filter: %s [%d]\n",
		     __FUNCTION__,
		     PictureGetFilterName(picture->filter),
		     picture->filter));
		return false;
	}
}

static uint32_t gen4_repeat(uint32_t repeat)
{
	switch (repeat) {
	default:
		assert(0);
	case RepeatNone:
		return SAMPLER_EXTEND_NONE;
	case RepeatNormal:
		return SAMPLER_EXTEND_REPEAT;
	case RepeatPad:
		return SAMPLER_EXTEND_PAD;
	case RepeatReflect:
		return SAMPLER_EXTEND_REFLECT;
	}
}

static bool gen4_check_repeat(PicturePtr picture)
{
	if (!picture->repeat)
		return true;

	switch (picture->repeatType) {
	case RepeatNone:
	case RepeatNormal:
	case RepeatPad:
	case RepeatReflect:
		return true;
	default:
		DBG(("%s: unknown repeat: %d\n",
		     __FUNCTION__, picture->repeatType));
		return false;
	}
}

static uint32_t
gen4_tiling_bits(uint32_t tiling)
{
	switch (tiling) {
	default: assert(0);
	case I915_TILING_NONE: return 0;
	case I915_TILING_X: return GEN4_SURFACE_TILED;
	case I915_TILING_Y: return GEN4_SURFACE_TILED | GEN4_SURFACE_TILED_Y;
	}
}

/**
 * Sets up the common fields of a surface state entry for the given bo,
 * returning its byte offset within the surface state buffer.
 */
static uint32_t
gen4_bind_bo(struct sna *sna,
	     struct kgem_bo *bo,
	     uint32_t width,
	     uint32_t height,
	     uint32_t format,
	     bool is_dst)
{
	uint32_t domains;
	uint16_t offset;
	uint32_t *ss;

	assert(sna->kgem.gen != 040 || !kgem_bo_is_snoop(bo));

	/* After the first bind, we manage the cache domains within the batch */
	offset = kgem_bo_get_binding(bo, format | is_dst << 31);
	if (offset) {
		assert(offset >= sna->kgem.surface);
		if (is_dst)
			kgem_bo_mark_dirty(bo);
		return offset * sizeof(uint32_t);
	}

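	/* Surface state entries are allocated downwards from the top of the
	 * batch buffer, while commands grow upwards from the bottom.
	 */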
	offset = sna->kgem.surface -=
		sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
	ss = sna->kgem.batch + offset;

	ss[0] = (GEN4_SURFACE_2D << GEN4_SURFACE_TYPE_SHIFT |
		 GEN4_SURFACE_BLEND_ENABLED |
		 format << GEN4_SURFACE_FORMAT_SHIFT);

	if (is_dst) {
		ss[0] |= GEN4_SURFACE_RC_READ_WRITE;
		domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER;
	} else
		domains = I915_GEM_DOMAIN_SAMPLER << 16;
	ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0);

	ss[2] = ((width - 1)  << GEN4_SURFACE_WIDTH_SHIFT |
		 (height - 1) << GEN4_SURFACE_HEIGHT_SHIFT);
	ss[3] = (gen4_tiling_bits(bo->tiling) |
		 (bo->pitch - 1) << GEN4_SURFACE_PITCH_SHIFT);
	ss[4] = 0;
	ss[5] = 0;

	kgem_bo_set_binding(bo, format | is_dst << 31, offset);

	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
	     offset, bo->handle, ss[1],
	     format, width, height, bo->pitch, bo->tiling,
	     domains & 0xffff ? "render" : "sampler"));

	return offset * sizeof(uint32_t);
}

static void gen4_emit_vertex_buffer(struct sna *sna,
				    const struct sna_composite_op *op)
{
	int id = op->u.gen4.ve_id;

	assert((sna->render.vb_id & (1 << id)) == 0);

	OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | 3);
	OUT_BATCH((id << VB0_BUFFER_INDEX_SHIFT) | VB0_VERTEXDATA |
		  (4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
	assert(sna->render.nvertex_reloc < ARRAY_SIZE(sna->render.vertex_reloc));
	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);

	sna->render.vb_id |= 1 << id;
}

inline static void
gen4_emit_pipe_flush(struct sna *sna)
{
#if 1
	OUT_BATCH(GEN4_PIPE_CONTROL |
		  GEN4_PIPE_CONTROL_WC_FLUSH |
		  (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
#else
	OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
#endif
}

inline static void
gen4_emit_pipe_break(struct sna *sna)
{
#if !ALWAYS_FLUSH
	OUT_BATCH(GEN4_PIPE_CONTROL | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
#else
	OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
#endif
}

inline static void
gen4_emit_pipe_invalidate(struct sna *sna)
{
#if 0
	OUT_BATCH(GEN4_PIPE_CONTROL |
		  GEN4_PIPE_CONTROL_WC_FLUSH |
		  (sna->kgem.gen >= 045 ? GEN4_PIPE_CONTROL_TC_FLUSH : 0) |
		  (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
#else
	OUT_BATCH(MI_FLUSH);
#endif
}

static void gen4_emit_primitive(struct sna *sna)
{
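	/* If the last command emitted was also a 3DPRIMITIVE, extend it by
	 * pointing vertex_offset back at its vertex-count dword (5 dwords
	 * before the end of the command).
	 */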
	if (sna->kgem.nbatch == sna->render_state.gen4.last_primitive) {
		sna->render.vertex_offset = sna->kgem.nbatch - 5;
		return;
	}

	OUT_BATCH(GEN4_3DPRIMITIVE |
		  GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
		  (_3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT) |
		  (0 << 9) |
		  4);
	sna->render.vertex_offset = sna->kgem.nbatch;
	OUT_BATCH(0);	/* vertex count, to be filled in later */
	OUT_BATCH(sna->render.vertex_index);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */
	sna->render.vertex_start = sna->render.vertex_index;

	sna->render_state.gen4.last_primitive = sna->kgem.nbatch;
}

static bool gen4_rectangle_begin(struct sna *sna,
				 const struct sna_composite_op *op)
{
	unsigned int id = 1 << op->u.gen4.ve_id;
	int ndwords;

	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
		return true;

	/* 7xpipelined pointers + 6xprimitive + 1xflush */
	ndwords = op->need_magic_ca_pass? 19 : 6;
	if ((sna->render.vb_id & id) == 0)
		ndwords += 5;
	ndwords += 8*FORCE_FLUSH;

	if (!kgem_check_batch(&sna->kgem, ndwords))
		return false;

	if ((sna->render.vb_id & id) == 0)
		gen4_emit_vertex_buffer(sna, op);
	if (sna->render.vertex_offset == 0)
		gen4_emit_primitive(sna);

	return true;
}

static int gen4_get_rectangles__flush(struct sna *sna,
				      const struct sna_composite_op *op)
{
	/* Avoid discarding the new vbo after lock contention */
	if (sna_vertex_wait__locked(&sna->render)) {
		int rem = vertex_space(sna);
		if (rem > op->floats_per_rect)
			return rem;
	}

	if (!kgem_check_batch(&sna->kgem,
			      8*FORCE_FLUSH + (op->need_magic_ca_pass ? 2*19+6 : 6)))
		return 0;
	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
		return 0;

	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		if (gen4_magic_ca_pass(sna, op))
			gen4_emit_pipelined_pointers(sna, op, op->op,
						     op->u.gen4.wm_kernel);
	}

	return gen4_vertex_finish(sna);
}

inline static int gen4_get_rectangles(struct sna *sna,
				      const struct sna_composite_op *op,
				      int want,
				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
{
	int rem;

	assert(want);
#if FORCE_FLUSH
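	/* Work around the gen4 flushing bug by forcing a pipeline break
	 * after every MAX_FLUSH_VERTICES rectangles: count the rectangles
	 * emitted since the last break and cap this request accordingly.
	 */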
	rem = sna->render.vertex_offset;
	if (sna->kgem.nbatch == sna->render_state.gen4.last_primitive)
		rem = sna->kgem.nbatch - 5;
	if (rem) {
		rem = MAX_FLUSH_VERTICES - (sna->render.vertex_index - sna->render.vertex_start) / 3;
		if (rem <= 0) {
			if (sna->render.vertex_offset) {
				gen4_vertex_flush(sna);
				if (gen4_magic_ca_pass(sna, op)) {
					if (kgem_check_batch(&sna->kgem, 19+6))
						gen4_emit_pipelined_pointers(sna, op, op->op,
									     op->u.gen4.wm_kernel);
				}
			}
			gen4_emit_pipe_break(sna);
			rem = MAX_FLUSH_VERTICES;
		}
	} else
		rem = MAX_FLUSH_VERTICES;
	if (want > rem)
		want = rem;
#endif

start:
	rem = vertex_space(sna);
	if (unlikely(rem < op->floats_per_rect)) {
		DBG(("flushing vbo for %s: %d < %d\n",
		     __FUNCTION__, rem, op->floats_per_rect));
		rem = gen4_get_rectangles__flush(sna, op);
		if (unlikely(rem == 0))
			goto flush;
	}

	if (unlikely(sna->render.vertex_offset == 0)) {
		if (!gen4_rectangle_begin(sna, op))
			goto flush;
		else
			goto start;
	}

	assert(rem <= vertex_space(sna));
	assert(op->floats_per_rect <= rem);
	if (want > 1 && want * op->floats_per_rect > rem)
		want = rem / op->floats_per_rect;

	sna->render.vertex_index += 3*want;
	return want;

flush:
	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		gen4_magic_ca_pass(sna, op);
	}
	sna_vertex_wait__locked(&sna->render);
	_kgem_submit(&sna->kgem);
	emit_state(sna, op);
	goto start;
}

static uint32_t *
gen4_composite_get_binding_table(struct sna *sna, uint16_t *offset)
{
	sna->kgem.surface -=
		sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);

	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));

	/* Clear all surplus entries to zero in case of prefetch */
	*offset = sna->kgem.surface;
	return memset(sna->kgem.batch + sna->kgem.surface,
		      0, sizeof(struct gen4_surface_state_padded));
}

static void
gen4_emit_urb(struct sna *sna)
{
	int urb_vs_end;
	int urb_gs_end;
	int urb_cl_end;
	int urb_sf_end;
	int urb_cs_end;

	if (!sna->render_state.gen4.needs_urb)
		return;

	urb_vs_end =              URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
	urb_gs_end = urb_vs_end + URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
	urb_cl_end = urb_gs_end + URB_CL_ENTRIES * URB_CL_ENTRY_SIZE;
	urb_sf_end = urb_cl_end + URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
	urb_cs_end = urb_sf_end + URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
	assert(urb_cs_end <= 256);

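	/* The URB_FENCE command must not cross a 64-byte cacheline boundary,
	 * so pad with NOOPs until its 3 dwords fit within the current line.
	 */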
	while ((sna->kgem.nbatch & 15) > 12)
		OUT_BATCH(MI_NOOP);

	OUT_BATCH(GEN4_URB_FENCE |
		  UF0_CS_REALLOC |
		  UF0_SF_REALLOC |
		  UF0_CLIP_REALLOC |
		  UF0_GS_REALLOC |
		  UF0_VS_REALLOC |
		  1);
	OUT_BATCH(urb_cl_end << UF1_CLIP_FENCE_SHIFT |
		  urb_gs_end << UF1_GS_FENCE_SHIFT |
		  urb_vs_end << UF1_VS_FENCE_SHIFT);
	OUT_BATCH(urb_cs_end << UF2_CS_FENCE_SHIFT |
		  urb_sf_end << UF2_SF_FENCE_SHIFT);

	/* Constant buffer state */
	OUT_BATCH(GEN4_CS_URB_STATE | 0);
	OUT_BATCH((URB_CS_ENTRY_SIZE - 1) << 4 | URB_CS_ENTRIES << 0);

	sna->render_state.gen4.needs_urb = false;
}

static void
gen4_emit_state_base_address(struct sna *sna)
{
	assert(sna->render_state.gen4.general_bo->proxy == NULL);
	OUT_BATCH(GEN4_STATE_BASE_ADDRESS | 4);
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* general */
				 sna->kgem.nbatch,
				 sna->render_state.gen4.general_bo,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* surface */
				 sna->kgem.nbatch,
				 NULL,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));
	OUT_BATCH(0); /* media */

	/* upper bounds, all disabled */
	OUT_BATCH(BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
}

static void
gen4_emit_invariant(struct sna *sna)
{
	assert(sna->kgem.surface == sna->kgem.batch_size);

	if (sna->kgem.gen >= 045)
		OUT_BATCH(NEW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
	else
		OUT_BATCH(GEN4_PIPELINE_SELECT | PIPELINE_SELECT_3D);

	gen4_emit_state_base_address(sna);

	sna->render_state.gen4.needs_invariant = false;
}

static void
gen4_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);

	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150 + 50*FORCE_FLUSH, 4)) {
		DBG(("%s: flushing batch: %d < %d+%d\n",
		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
		     150, 4*8));
		kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	if (sna->render_state.gen4.needs_invariant)
		gen4_emit_invariant(sna);
}

static void
gen4_align_vertex(struct sna *sna, const struct sna_composite_op *op)
{
	assert(op->floats_per_rect == 3*op->floats_per_vertex);
	if (op->floats_per_vertex != sna->render_state.gen4.floats_per_vertex) {
		DBG(("aligning vertex: was %d, now %d floats per vertex\n",
		     sna->render_state.gen4.floats_per_vertex,
		     op->floats_per_vertex));
		gen4_vertex_align(sna, op);
		sna->render_state.gen4.floats_per_vertex = op->floats_per_vertex;
	}
}

static void
gen4_emit_binding_table(struct sna *sna, uint16_t offset)
{
	if (sna->render_state.gen4.surface_table == offset)
		return;

	sna->render_state.gen4.surface_table = offset;

	/* Binding table pointers */
	OUT_BATCH(GEN4_3DSTATE_BINDING_TABLE_POINTERS | 4);
	OUT_BATCH(0);		/* vs */
	OUT_BATCH(0);		/* gs */
	OUT_BATCH(0);		/* clip */
	OUT_BATCH(0);		/* sf */
	/* Only the PS uses the binding table */
	OUT_BATCH(offset*4);
}

static void
gen4_emit_pipelined_pointers(struct sna *sna,
			     const struct sna_composite_op *op,
			     int blend, int kernel)
{
	uint16_t sp, bp;
	uint32_t key;

	DBG(("%s: has_mask=%d, src=(%d, %d), mask=(%d, %d), kernel=%d, blend=%d, ca=%d, format=%x\n",
	     __FUNCTION__, op->u.gen4.ve_id & 2,
	     op->src.filter, op->src.repeat,
	     op->mask.filter, op->mask.repeat,
	     kernel, blend, op->has_component_alpha, (int)op->dst.format));

	sp = SAMPLER_OFFSET(op->src.filter, op->src.repeat,
			    op->mask.filter, op->mask.repeat,
			    kernel);
	bp = gen4_get_blend(blend, op->has_component_alpha, op->dst.format);

	DBG(("%s: sp=%d, bp=%d\n", __FUNCTION__, sp, bp));
	key = sp | (uint32_t)bp << 16;
	if (key == sna->render_state.gen4.last_pipelined_pointers)
		return;

	OUT_BATCH(GEN4_3DSTATE_PIPELINED_POINTERS | 5);
	OUT_BATCH(sna->render_state.gen4.vs);
	OUT_BATCH(GEN4_GS_DISABLE); /* passthrough */
	OUT_BATCH(GEN4_CLIP_DISABLE); /* passthrough */
	OUT_BATCH(sna->render_state.gen4.sf);
	OUT_BATCH(sna->render_state.gen4.wm + sp);
	OUT_BATCH(sna->render_state.gen4.cc + bp);

	sna->render_state.gen4.last_pipelined_pointers = key;
	gen4_emit_urb(sna);
}

static bool
gen4_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
{
	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;

	assert(!too_large(abs(op->dst.x), abs(op->dst.y)));
	assert(!too_large(op->dst.width, op->dst.height));

	if (sna->render_state.gen4.drawrect_limit == limit &&
	    sna->render_state.gen4.drawrect_offset == offset)
		return true;

	sna->render_state.gen4.drawrect_offset = offset;
	sna->render_state.gen4.drawrect_limit = limit;

	OUT_BATCH(GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(limit);
	OUT_BATCH(offset);
	return false;
}

static void
gen4_emit_vertex_elements(struct sna *sna,
			  const struct sna_composite_op *op)
{
	/*
	 * vertex data in vertex buffer
	 *    position: (x, y)
	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
	 *    texture coordinate 1 if (has_mask is true): same as above
	 */
	struct gen4_render_state *render = &sna->render_state.gen4;
	uint32_t src_format, dw;
	int id = op->u.gen4.ve_id;

	if (render->ve_id == id)
		return;
	render->ve_id = id;

	/* The VUE layout
	 *    dword 0-3: position (x, y, 1.0, 1.0),
	 *    dword 4-7: texture coordinate 0 (u0, v0, w0, 1.0)
	 *    [optional] dword 8-11: texture coordinate 1 (u1, v1, w1, 1.0)
	 */
	OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * (1 + 2) - 1));

	/* x,y */
	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
		  GEN4_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
		  0 << VE0_OFFSET_SHIFT);
	OUT_BATCH(VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
		  VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
		  (1*4) << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);

	/* u0, v0, w0 */
	DBG(("%s: first channel %d floats, offset=4b\n", __FUNCTION__, id & 3));
	dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
	switch (id & 3) {
	default:
		assert(0);
	case 0:
		src_format = GEN4_SURFACEFORMAT_R16G16_SSCALED;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 1:
		src_format = GEN4_SURFACEFORMAT_R32_FLOAT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 2:
		src_format = GEN4_SURFACEFORMAT_R32G32_FLOAT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 3:
		src_format = GEN4_SURFACEFORMAT_R32G32B32_FLOAT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
		break;
	}
	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
		  src_format << VE0_FORMAT_SHIFT |
		  4 << VE0_OFFSET_SHIFT);
	OUT_BATCH(dw | 8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);

	/* u1, v1, w1 */
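	/* The low two bits of ve_id give the number of floats in the first
	 * texcoord (0 meaning a packed pair of s16, occupying one dword),
	 * which determines where the second channel starts in the vertex.
	 */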
	if (id >> 2) {
		unsigned src_offset = 4 + ((id & 3) ?: 1) * sizeof(float);
		DBG(("%s: second channel %d floats, offset=%db\n", __FUNCTION__,
		     id >> 2, src_offset));
		dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
		switch (id >> 2) {
		case 1:
			src_format = GEN4_SURFACEFORMAT_R32_FLOAT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
			break;
		default:
			assert(0);
		case 2:
			src_format = GEN4_SURFACEFORMAT_R32G32_FLOAT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
			break;
		case 3:
			src_format = GEN4_SURFACEFORMAT_R32G32B32_FLOAT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
			break;
		}
		OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
			  src_format << VE0_FORMAT_SHIFT |
			  src_offset << VE0_OFFSET_SHIFT);
		OUT_BATCH(dw | 12 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
	} else {
		OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
			  GEN4_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
			  0 << VE0_OFFSET_SHIFT);
		OUT_BATCH(VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
			  VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
			  VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
			  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
			  12 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
	}
}

static void
gen4_emit_state(struct sna *sna,
		const struct sna_composite_op *op,
		uint16_t wm_binding_table)
{
	bool flush;

	assert(op->dst.bo->exec);

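	/* The binding-table offset is always 8-dword aligned, so its low bit
	 * is borrowed to carry the "destination may be dirty" flag.
	 */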
	flush = wm_binding_table & 1;
	wm_binding_table &= ~1;

	if (ALWAYS_FLUSH || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
		DBG(("%s: flushing dirty (%d, %d), forced? %d\n", __FUNCTION__,
		     kgem_bo_is_dirty(op->src.bo),
		     kgem_bo_is_dirty(op->mask.bo),
		     flush));
		gen4_emit_pipe_invalidate(sna);
		kgem_clear_dirty(&sna->kgem);
		kgem_bo_mark_dirty(op->dst.bo);
		flush = false;
	}
	flush &= gen4_emit_drawing_rectangle(sna, op);
	if (flush && op->op > PictOpSrc)
		gen4_emit_pipe_flush(sna);

	gen4_emit_binding_table(sna, wm_binding_table);
	gen4_emit_pipelined_pointers(sna, op, op->op, op->u.gen4.wm_kernel);
	gen4_emit_vertex_elements(sna, op);
}

static void
gen4_bind_surfaces(struct sna *sna,
		   const struct sna_composite_op *op)
{
	uint32_t *binding_table;
	uint16_t offset, dirty;

	gen4_get_batch(sna, op);
	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table = gen4_composite_get_binding_table(sna, &offset);

	binding_table[0] =
		gen4_bind_bo(sna,
			    op->dst.bo, op->dst.width, op->dst.height,
			    gen4_get_dest_format(op->dst.format),
			    true);
	binding_table[1] =
		gen4_bind_bo(sna,
			     op->src.bo, op->src.width, op->src.height,
			     op->src.card_format,
			     false);
	if (op->mask.bo) {
		assert(op->u.gen4.ve_id >> 2);
		binding_table[2] =
			gen4_bind_bo(sna,
				     op->mask.bo,
				     op->mask.width,
				     op->mask.height,
				     op->mask.card_format,
				     false);
	}

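	/* If the new binding table matches the one already in use, roll back
	 * the allocation and reuse the previous table.
	 */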
	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen4.surface_table) == *(uint64_t*)binding_table &&
	    (op->mask.bo == NULL ||
	     sna->kgem.batch[sna->render_state.gen4.surface_table+2] == binding_table[2])) {
		sna->kgem.surface += sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
		offset = sna->render_state.gen4.surface_table;
	}

	if (!ALWAYS_FLUSH && sna->kgem.batch[sna->render_state.gen4.surface_table] == binding_table[0])
		dirty = 0;

	gen4_emit_state(sna, op, offset | dirty);
}

fastcall static void
gen4_render_composite_blt(struct sna *sna,
			  const struct sna_composite_op *op,
			  const struct sna_composite_rectangles *r)
{
	DBG(("%s: src=(%d, %d)+(%d, %d), mask=(%d, %d)+(%d, %d), dst=(%d, %d)+(%d, %d), size=(%d, %d)\n",
	     __FUNCTION__,
	     r->src.x, r->src.y, op->src.offset[0], op->src.offset[1],
	     r->mask.x, r->mask.y, op->mask.offset[0], op->mask.offset[1],
	     r->dst.x, r->dst.y, op->dst.x, op->dst.y,
	     r->width, r->height));

	gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
	op->prim_emit(sna, op, r);
}

fastcall static void
gen4_render_composite_box(struct sna *sna,
			  const struct sna_composite_op *op,
			  const BoxRec *box)
{
	struct sna_composite_rectangles r;

	DBG(("  %s: (%d, %d), (%d, %d)\n",
	     __FUNCTION__,
	     box->x1, box->y1, box->x2, box->y2));

	gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);

	r.dst.x = box->x1;
	r.dst.y = box->y1;
	r.width  = box->x2 - box->x1;
	r.height = box->y2 - box->y1;
	r.mask = r.src = r.dst;

	op->prim_emit(sna, op, &r);
}

static void
gen4_render_composite_boxes__blt(struct sna *sna,
				 const struct sna_composite_op *op,
				 const BoxRec *box, int nbox)
{
	DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
	     __FUNCTION__, nbox, op->dst.x, op->dst.y,
	     op->src.offset[0], op->src.offset[1],
	     op->src.width, op->src.height,
	     op->mask.offset[0], op->mask.offset[1],
	     op->mask.width, op->mask.height));

	do {
		int nbox_this_time;

		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
						     gen4_bind_surfaces);
		nbox -= nbox_this_time;

		do {
			struct sna_composite_rectangles r;

			DBG(("  %s: (%d, %d), (%d, %d)\n",
			     __FUNCTION__,
			     box->x1, box->y1, box->x2, box->y2));

			r.dst.x = box->x1;
			r.dst.y = box->y1;
			r.width  = box->x2 - box->x1;
			r.height = box->y2 - box->y1;
			r.mask = r.src = r.dst;
			op->prim_emit(sna, op, &r);
			box++;
		} while (--nbox_this_time);
	} while (nbox);
}

static void
gen4_render_composite_boxes(struct sna *sna,
			    const struct sna_composite_op *op,
			    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
						     gen4_bind_surfaces);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;
	} while (nbox);
}

#if !FORCE_FLUSH
static void
gen4_render_composite_boxes__thread(struct sna *sna,
				    const struct sna_composite_op *op,
				    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
						     gen4_bind_surfaces);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}
#endif

#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

static uint32_t gen4_bind_video_source(struct sna *sna,
				       struct kgem_bo *src_bo,
				       uint32_t src_offset,
				       int src_width,
				       int src_height,
				       int src_pitch,
				       uint32_t src_surf_format)
{
	struct gen4_surface_state *ss;

	sna->kgem.surface -= sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);

	ss = memset(sna->kgem.batch + sna->kgem.surface, 0, sizeof(*ss));
	ss->ss0.surface_type = GEN4_SURFACE_2D;
	ss->ss0.surface_format = src_surf_format;
	ss->ss0.color_blend = 1;

	ss->ss1.base_addr =
		kgem_add_reloc(&sna->kgem,
			       sna->kgem.surface + 1,
			       src_bo,
			       I915_GEM_DOMAIN_SAMPLER << 16,
			       src_offset);

	ss->ss2.width  = src_width - 1;
	ss->ss2.height = src_height - 1;
	ss->ss3.pitch  = src_pitch - 1;

	return sna->kgem.surface * sizeof(uint32_t);
}

static void gen4_video_bind_surfaces(struct sna *sna,
				     const struct sna_composite_op *op)
{
	struct sna_video_frame *frame = op->priv;
	uint32_t src_surf_format[6];
	uint32_t src_surf_base[6];
	int src_width[6];
	int src_height[6];
	int src_pitch[6];
	uint32_t *binding_table;
	uint16_t offset, dirty;
	int n_src, n;

	src_surf_base[0] = 0;
	src_surf_base[1] = 0;
	src_surf_base[2] = frame->VBufOffset;
	src_surf_base[3] = frame->VBufOffset;
	src_surf_base[4] = frame->UBufOffset;
	src_surf_base[5] = frame->UBufOffset;

	if (is_planar_fourcc(frame->id)) {
		for (n = 0; n < 2; n++) {
			src_surf_format[n] = GEN4_SURFACEFORMAT_R8_UNORM;
			src_width[n]  = frame->width;
			src_height[n] = frame->height;
			src_pitch[n]  = frame->pitch[1];
		}
		for (; n < 6; n++) {
			if (is_nv12_fourcc(frame->id))
				src_surf_format[n] = GEN4_SURFACEFORMAT_R8G8_UNORM;
			else
				src_surf_format[n] = GEN4_SURFACEFORMAT_R8_UNORM;
			src_width[n]  = frame->width / 2;
			src_height[n] = frame->height / 2;
			src_pitch[n]  = frame->pitch[0];
		}
		n_src = 6;
	} else {
		if (frame->id == FOURCC_UYVY)
			src_surf_format[0] = GEN4_SURFACEFORMAT_YCRCB_SWAPY;
		else
			src_surf_format[0] = GEN4_SURFACEFORMAT_YCRCB_NORMAL;

		src_width[0]  = frame->width;
		src_height[0] = frame->height;
		src_pitch[0]  = frame->pitch[0];
		n_src = 1;
	}

	gen4_get_batch(sna, op);
	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table = gen4_composite_get_binding_table(sna, &offset);
	binding_table[0] =
		gen4_bind_bo(sna,
			     op->dst.bo, op->dst.width, op->dst.height,
			     gen4_get_dest_format(op->dst.format),
			     true);
	for (n = 0; n < n_src; n++) {
		binding_table[1+n] =
			gen4_bind_video_source(sna,
					       frame->bo,
					       src_surf_base[n],
					       src_width[n],
					       src_height[n],
					       src_pitch[n],
					       src_surf_format[n]);
	}

	if (!ALWAYS_FLUSH && sna->kgem.batch[sna->render_state.gen4.surface_table] == binding_table[0])
		dirty = 0;

	gen4_emit_state(sna, op, offset | dirty);
}

static unsigned select_video_kernel(const struct sna_video *video,
				    const struct sna_video_frame *frame)
{
	switch (frame->id) {
	case FOURCC_YV12:
	case FOURCC_I420:
	case FOURCC_XVMC:
		return video->colorspace ?
			WM_KERNEL_VIDEO_PLANAR_BT709 :
			WM_KERNEL_VIDEO_PLANAR_BT601;

	case FOURCC_NV12:
		return video->colorspace ?
			WM_KERNEL_VIDEO_NV12_BT709 :
			WM_KERNEL_VIDEO_NV12_BT601;

	default:
		return video->colorspace ?
			WM_KERNEL_VIDEO_PACKED_BT709 :
			WM_KERNEL_VIDEO_PACKED_BT601;
	}
}

static bool
gen4_render_video(struct sna *sna,
		  struct sna_video *video,
		  struct sna_video_frame *frame,
		  RegionPtr dstRegion,
		  PixmapPtr pixmap)
{
	struct sna_composite_op tmp;
	struct sna_pixmap *priv = sna_pixmap(pixmap);
	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
	int src_width = frame->src.x2 - frame->src.x1;
	int src_height = frame->src.y2 - frame->src.y1;
	float src_offset_x, src_offset_y;
	float src_scale_x, src_scale_y;
	const BoxRec *box;
	int nbox;

	DBG(("%s: %dx%d -> %dx%d\n", __FUNCTION__,
	     src_width, src_height, dst_width, dst_height));

	assert(priv->gpu_bo);
	memset(&tmp, 0, sizeof(tmp));

	tmp.op = PictOpSrc;
	tmp.dst.pixmap = pixmap;
	tmp.dst.width  = pixmap->drawable.width;
	tmp.dst.height = pixmap->drawable.height;
	tmp.dst.format = sna_format_for_depth(pixmap->drawable.depth);
	tmp.dst.bo = priv->gpu_bo;

	if (src_width == dst_width && src_height == dst_height)
		tmp.src.filter = SAMPLER_FILTER_NEAREST;
	else
		tmp.src.filter = SAMPLER_FILTER_BILINEAR;
	tmp.src.repeat = SAMPLER_EXTEND_PAD;
	tmp.src.bo = frame->bo;
	tmp.mask.bo = NULL;
	tmp.u.gen4.wm_kernel = select_video_kernel(video, frame);
	tmp.u.gen4.ve_id = 2;
	tmp.is_affine = true;
	tmp.floats_per_vertex = 3;
	tmp.floats_per_rect = 9;
	tmp.priv = frame;

	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
			return false;
	}

	gen4_align_vertex(sna, &tmp);
	gen4_video_bind_surfaces(sna, &tmp);

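	/* Map destination pixel coordinates to normalised source texture
	 * coordinates: scale by the src:dst ratio and divide by the frame
	 * size, offset by the source origin.
	 */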
	src_scale_x = (float)src_width / dst_width / frame->width;
	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;

	src_scale_y = (float)src_height / dst_height / frame->height;
	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;

	box = region_rects(dstRegion);
	nbox = region_num_rects(dstRegion);
	do {
		int n;

		n = gen4_get_rectangles(sna, &tmp, nbox,
					gen4_video_bind_surfaces);
		assert(n);
		nbox -= n;

		do {
			OUT_VERTEX(box->x2, box->y2);
			OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

			OUT_VERTEX(box->x1, box->y2);
			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

			OUT_VERTEX(box->x1, box->y1);
			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
			OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);

			box++;
		} while (--n);
	} while (nbox);
	gen4_vertex_flush(sna);

	if (!DAMAGE_IS_ALL(priv->gpu_damage))
		sna_damage_add(&priv->gpu_damage, dstRegion);

	return true;
}

static int
gen4_composite_picture(struct sna *sna,
		       PicturePtr picture,
		       struct sna_composite_channel *channel,
		       int x, int y,
		       int w, int h,
		       int dst_x, int dst_y,
		       bool precise)
{
	PixmapPtr pixmap;
	uint32_t color;
	int16_t dx, dy;

	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
	     __FUNCTION__, x, y, w, h, dst_x, dst_y));

	channel->is_solid = false;
	channel->card_format = -1;

	if (sna_picture_is_solid(picture, &color))
		return gen4_channel_init_solid(sna, channel, color);

	if (picture->pDrawable == NULL) {
		int ret;

		if (picture->pSourcePict->type == SourcePictTypeLinear)
			return gen4_channel_init_linear(sna, picture, channel,
							x, y,
							w, h,
							dst_x, dst_y);

		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
		ret = -1;
		if (!precise)
			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
								      x, y, w, h, dst_x, dst_y);
		if (ret == -1)
			ret = sna_render_picture_fixup(sna, picture, channel,
						       x, y, w, h, dst_x, dst_y);
		return ret;
	}

	if (picture->alphaMap) {
		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);
	}

	if (!gen4_check_repeat(picture)) {
		DBG(("%s: unknown repeat mode fixup\n", __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);
	}

	if (!gen4_check_filter(picture)) {
		DBG(("%s: unhandled filter fixup\n", __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);
	}

	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
	channel->filter = picture->filter;

	pixmap = get_drawable_pixmap(picture->pDrawable);
	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);

	x += dx + picture->pDrawable->x;
	y += dy + picture->pDrawable->y;

	channel->is_affine = sna_transform_is_affine(picture->transform);
	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
		DBG(("%s: integer translation (%d, %d), removing\n",
		     __FUNCTION__, dx, dy));
		x += dx;
		y += dy;
		channel->transform = NULL;
		channel->filter = PictFilterNearest;

		if (channel->repeat &&
		    (x >= 0 &&
		     y >= 0 &&
		     x + w <= pixmap->drawable.width &&
		     y + h <= pixmap->drawable.height)) {
			struct sna_pixmap *priv = sna_pixmap(pixmap);
			if (priv && priv->clear) {
				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
				return gen4_channel_init_solid(sna, channel,
							       solid_color(picture->format,
									   priv->clear_color));
			}
		}
	} else
		channel->transform = picture->transform;

	channel->pict_format = picture->format;
	channel->card_format = gen4_get_card_format(picture->format);
	if (channel->card_format == -1)
		return sna_render_picture_convert(sna, picture, channel, pixmap,
						  x, y, w, h, dst_x, dst_y,
						  false);

	if (too_large(pixmap->drawable.width, pixmap->drawable.height))
		return sna_render_picture_extract(sna, picture, channel,
						  x, y, w, h, dst_x, dst_y);

	return sna_render_pixmap_bo(sna, channel, pixmap,
				    x, y, w, h, dst_x, dst_y);
}

static void gen4_composite_channel_convert(struct sna_composite_channel *channel)
{
	DBG(("%s: repeat %d -> %d, filter %d -> %d\n",
	     __FUNCTION__,
	     channel->repeat, gen4_repeat(channel->repeat),
	     channel->filter, gen4_filter(channel->filter)));
	channel->repeat = gen4_repeat(channel->repeat);
	channel->filter = gen4_filter(channel->filter);
	if (channel->card_format == (unsigned)-1)
		channel->card_format = gen4_get_card_format(channel->pict_format);
}

static void
gen4_render_composite_done(struct sna *sna,
			   const struct sna_composite_op *op)
{
	DBG(("%s()\n", __FUNCTION__));

	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		gen4_magic_ca_pass(sna, op);
	}

	if (op->mask.bo)
		kgem_bo_destroy(&sna->kgem, op->mask.bo);
	if (op->src.bo)
		kgem_bo_destroy(&sna->kgem, op->src.bo);

	sna_render_composite_redirect_done(sna, op);
}

static bool
gen4_composite_set_target(struct sna *sna,
			  struct sna_composite_op *op,
			  PicturePtr dst,
			  int x, int y, int w, int h,
			  bool partial)
{
	BoxRec box;
	unsigned hint;

	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
	op->dst.width  = op->dst.pixmap->drawable.width;
	op->dst.height = op->dst.pixmap->drawable.height;
	op->dst.format = dst->format;
	if (w && h) {
		box.x1 = x;
		box.y1 = y;
		box.x2 = x + w;
		box.y2 = y + h;
	} else
		sna_render_picture_extents(dst, &box);

	hint = PREFER_GPU | RENDER_GPU;
	if (!need_tiling(sna, op->dst.width, op->dst.height))
		hint |= FORCE_GPU;
	if (!partial) {
		hint |= IGNORE_DAMAGE;
		if (w == op->dst.width && h == op->dst.height)
			hint |= REPLACES;
	}

	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
	if (op->dst.bo == NULL)
		return false;

	if (hint & REPLACES) {
		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
	}

	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
			    &op->dst.x, &op->dst.y);

	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
	     __FUNCTION__,
	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
	     op->dst.width, op->dst.height,
	     op->dst.bo->pitch,
	     op->dst.x, op->dst.y,
	     op->damage ? *op->damage : (void *)-1));

	assert(op->dst.bo->proxy == NULL);

	if (too_large(op->dst.width, op->dst.height) &&
	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
		return false;

	return true;
}

static bool
check_gradient(PicturePtr picture, bool precise)
{
	switch (picture->pSourcePict->type) {
	case SourcePictTypeSolidFill:
	case SourcePictTypeLinear:
		return false;
	default:
		return precise;
	}
}

static bool
has_alphamap(PicturePtr p)
{
	return p->alphaMap != NULL;
}

static bool
need_upload(struct sna *sna, PicturePtr p)
{
	return p->pDrawable && untransformed(p) &&
		!is_gpu(sna, p->pDrawable, PREFER_GPU_RENDER);
}

static bool
source_is_busy(PixmapPtr pixmap)
{
	struct sna_pixmap *priv = sna_pixmap(pixmap);
	if (priv == NULL)
		return false;

	if (priv->clear)
		return false;

	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
		return true;

	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
		return true;

	return priv->gpu_damage && !priv->cpu_damage;
}

static bool
source_fallback(struct sna *sna, PicturePtr p, PixmapPtr pixmap, bool precise)
{
	if (sna_picture_is_solid(p, NULL))
		return false;

	if (p->pSourcePict)
		return check_gradient(p, precise);

	if (!gen4_check_repeat(p) || !gen4_check_format(p->format))
		return true;

	/* soft errors: prefer to upload/compute rather than readback */
	if (pixmap && source_is_busy(pixmap))
		return false;

	return has_alphamap(p) || !gen4_check_filter(p) || need_upload(sna, p);
}

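/* Decide whether to punt the whole composite to the CPU: prefer the
 * GPU whenever the destination (or any source) is already there, but
 * fall back if a source would itself require a readback, or if the
 * destination is CPU-resident and too large for a single 3D pass.
 */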
static bool
gen4_composite_fallback(struct sna *sna,
			PicturePtr src,
			PicturePtr mask,
			PicturePtr dst)
{
	PixmapPtr src_pixmap;
	PixmapPtr mask_pixmap;
	PixmapPtr dst_pixmap;
	bool src_fallback, mask_fallback;

	if (!gen4_check_dst_format(dst->format)) {
		DBG(("%s: unknown destination format: %d\n",
		     __FUNCTION__, dst->format));
		return true;
	}

	dst_pixmap = get_drawable_pixmap(dst->pDrawable);

	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
	src_fallback = source_fallback(sna, src, src_pixmap,
				       dst->polyMode == PolyModePrecise);

	if (mask) {
		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
		mask_fallback = source_fallback(sna, mask, mask_pixmap,
						dst->polyMode == PolyModePrecise);
	} else {
		mask_pixmap = NULL;
		mask_fallback = false;
	}

	/* If we are using the destination as a source and need to
	 * readback in order to upload the source, do it all
	 * on the CPU.
	 */
	if (src_pixmap == dst_pixmap && src_fallback) {
		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
		return true;
	}
	if (mask_pixmap == dst_pixmap && mask_fallback) {
		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
		return true;
	}

	/* If anything is on the GPU, push everything out to the GPU */
	if (dst_use_gpu(dst_pixmap)) {
		DBG(("%s: dst is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	if (src_pixmap && !src_fallback) {
		DBG(("%s: src is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}
	if (mask_pixmap && !mask_fallback) {
		DBG(("%s: mask is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	/* However, if the dst is not on the GPU and we need to
	 * render one of the sources using the CPU, we may
	 * as well do the entire operation in place on the CPU.
	 */
	if (src_fallback) {
		DBG(("%s: dst is on the CPU and src will fallback\n",
		     __FUNCTION__));
		return true;
	}

	if (mask_fallback) {
		DBG(("%s: dst is on the CPU and mask will fallback\n",
		     __FUNCTION__));
		return true;
	}

	if (too_large(dst_pixmap->drawable.width,
		      dst_pixmap->drawable.height) &&
	    dst_is_cpu(dst_pixmap)) {
		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
		return true;
	}

	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
	     __FUNCTION__));
	return dst_use_cpu(dst_pixmap);
}

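/* Try to reuse the prepared source channel for the mask: either they
 * are literally the same Picture, the mask is a solid colour (handled
 * by a cheap solid channel), or both sample the same drawable with
 * identical transform, alphamap, repeat, filter and format, in which
 * case only an extra bo reference is needed.
 */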
static int
reuse_source(struct sna *sna,
	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
{
	uint32_t color;

	if (src_x != msk_x || src_y != msk_y)
		return false;

	if (src == mask) {
		DBG(("%s: mask is source\n", __FUNCTION__));
		*mc = *sc;
		mc->bo = kgem_bo_reference(mc->bo);
		return true;
	}

	if (sna_picture_is_solid(mask, &color))
		return gen4_channel_init_solid(sna, mc, color);

	if (sc->is_solid)
		return false;

	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
		return false;

	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));

	if (!sna_transform_equal(src->transform, mask->transform))
		return false;

	if (!sna_picture_alphamap_equal(src, mask))
		return false;

	if (!gen4_check_repeat(mask))
		return false;

	if (!gen4_check_filter(mask))
		return false;

	if (!gen4_check_format(mask->format))
		return false;

	DBG(("%s: reusing source channel for mask with a twist\n",
	     __FUNCTION__));

	*mc = *sc;
	mc->repeat = gen4_repeat(mask->repeat ? mask->repeatType : RepeatNone);
	mc->filter = gen4_filter(mask->filter);
	mc->pict_format = mask->format;
	mc->card_format = gen4_get_card_format(mask->format);
	mc->bo = kgem_bo_reference(mc->bo);
	return true;
}

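/* The main composite entry point: try the BLT for simple unmasked
 * operations, tile if the extents exceed the 3D pipeline limits, and
 * otherwise bind the target, source and mask channels and pick the
 * WM kernel and vertex emitter for the render pipeline.
 */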
static bool
gen4_render_composite(struct sna *sna,
		      uint8_t op,
		      PicturePtr src,
		      PicturePtr mask,
		      PicturePtr dst,
		      int16_t src_x, int16_t src_y,
		      int16_t msk_x, int16_t msk_y,
		      int16_t dst_x, int16_t dst_y,
		      int16_t width, int16_t height,
		      unsigned flags,
		      struct sna_composite_op *tmp)
{
	DBG(("%s: %dx%d, current mode=%d\n", __FUNCTION__,
	     width, height, sna->kgem.mode));

	if (op >= ARRAY_SIZE(gen4_blend_op))
		return false;

	if (mask == NULL &&
	    sna_blt_composite(sna, op,
			      src, dst,
			      src_x, src_y,
			      dst_x, dst_y,
			      width, height,
			      flags, tmp))
		return true;

	if (gen4_composite_fallback(sna, src, mask, dst))
		goto fallback;

	if (need_tiling(sna, width, height))
		return sna_tiling_composite(op, src, mask, dst,
					    src_x, src_y,
					    msk_x, msk_y,
					    dst_x, dst_y,
					    width, height,
					    tmp);

	if (!gen4_composite_set_target(sna, tmp, dst,
				       dst_x, dst_y, width, height,
				       flags & COMPOSITE_PARTIAL || op > PictOpSrc)) {
		DBG(("%s: failed to set composite target\n", __FUNCTION__));
		goto fallback;
	}

	tmp->op = op;
	switch (gen4_composite_picture(sna, src, &tmp->src,
				       src_x, src_y,
				       width, height,
				       dst_x, dst_y,
				       dst->polyMode == PolyModePrecise)) {
	case -1:
		DBG(("%s: failed to prepare source\n", __FUNCTION__));
		goto cleanup_dst;
	case 0:
		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
			goto cleanup_dst;
		/* fall through to fixup */
	case 1:
		if (mask == NULL &&
		    sna_blt_composite__convert(sna,
					       dst_x, dst_y, width, height,
					       tmp))
			return true;

		gen4_composite_channel_convert(&tmp->src);
		break;
	}

	tmp->is_affine = tmp->src.is_affine;
	tmp->has_component_alpha = false;
	tmp->need_magic_ca_pass = false;

	if (mask) {
		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
			tmp->has_component_alpha = true;

			/* Check if it's component alpha that relies on a source alpha and on
			 * the source value.  We can only get one of those into the single
			 * source value that we get to blend with.  For PictOpOver we can
			 * compensate with two passes: an OutReverse pass here, with the
			 * missing term added back in gen4_magic_ca_pass().
			 */
			if (gen4_blend_op[op].src_alpha &&
			    (gen4_blend_op[op].src_blend != GEN4_BLENDFACTOR_ZERO)) {
				if (op != PictOpOver) {
					DBG(("%s -- fallback: unhandled component alpha blend\n",
					     __FUNCTION__));

					goto cleanup_src;
				}

				tmp->need_magic_ca_pass = true;
				tmp->op = PictOpOutReverse;
			}
		}

		if (!reuse_source(sna,
				  src, &tmp->src, src_x, src_y,
				  mask, &tmp->mask, msk_x, msk_y)) {
			switch (gen4_composite_picture(sna, mask, &tmp->mask,
						       msk_x, msk_y,
						       width, height,
						       dst_x, dst_y,
						       dst->polyMode == PolyModePrecise)) {
			case -1:
				DBG(("%s: failed to prepare mask\n", __FUNCTION__));
				goto cleanup_src;
			case 0:
				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
					goto cleanup_src;
				/* fall through to fixup */
			case 1:
				gen4_composite_channel_convert(&tmp->mask);
				break;
			}
		}

		tmp->is_affine &= tmp->mask.is_affine;
	}

	tmp->u.gen4.wm_kernel =
		gen4_choose_composite_kernel(tmp->op,
					     tmp->mask.bo != NULL,
					     tmp->has_component_alpha,
					     tmp->is_affine);
	tmp->u.gen4.ve_id = gen4_choose_composite_emitter(sna, tmp);

	tmp->blt   = gen4_render_composite_blt;
	tmp->box   = gen4_render_composite_box;
	tmp->boxes = gen4_render_composite_boxes__blt;
	if (tmp->emit_boxes) {
		tmp->boxes = gen4_render_composite_boxes;
#if !FORCE_FLUSH
		tmp->thread_boxes = gen4_render_composite_boxes__thread;
#endif
	}
	tmp->done  = gen4_render_composite_done;

	if (!kgem_check_bo(&sna->kgem,
			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
			   NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem,
				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
				   NULL))
			goto cleanup_mask;
	}

	gen4_align_vertex(sna, tmp);
	gen4_bind_surfaces(sna, tmp);
	return true;

cleanup_mask:
	if (tmp->mask.bo) {
		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
		tmp->mask.bo = NULL;
	}
cleanup_src:
	if (tmp->src.bo) {
		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
		tmp->src.bo = NULL;
	}
cleanup_dst:
	if (tmp->redirect.real_bo) {
		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
		tmp->redirect.real_bo = NULL;
	}
fallback:
	return (mask == NULL &&
		sna_blt_composite(sna, op,
				  src, dst,
				  src_x, src_y,
				  dst_x, dst_y,
				  width, height,
				  flags | COMPOSITE_FALLBACK, tmp));
}

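/* Composite spans: like composite, but each box carries an opacity
 * which the emitter writes as an extra vertex component for the
 * WM_KERNEL_OPACITY shaders to multiply in.
 */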
#if !NO_COMPOSITE_SPANS
fastcall static void
gen4_render_composite_spans_box(struct sna *sna,
				const struct sna_composite_spans_op *op,
				const BoxRec *box, float opacity)
{
	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
	     __FUNCTION__,
	     op->base.src.offset[0], op->base.src.offset[1],
	     opacity,
	     op->base.dst.x, op->base.dst.y,
	     box->x1, box->y1,
	     box->x2 - box->x1,
	     box->y2 - box->y1));

	gen4_get_rectangles(sna, &op->base, 1, gen4_bind_surfaces);
	op->prim_emit(sna, op, box, opacity);
}

static void
gen4_render_composite_spans_boxes(struct sna *sna,
				  const struct sna_composite_spans_op *op,
				  const BoxRec *box, int nbox,
				  float opacity)
{
	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     opacity,
	     op->base.dst.x, op->base.dst.y));

	do {
		int nbox_this_time;

		nbox_this_time = gen4_get_rectangles(sna, &op->base, nbox,
						     gen4_bind_surfaces);
		nbox -= nbox_this_time;

		do {
			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
			     box->x1, box->y1,
			     box->x2 - box->x1,
			     box->y2 - box->y1));

			op->prim_emit(sna, op, box++, opacity);
		} while (--nbox_this_time);
	} while (nbox);
}

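/* Threaded span emission: reserve vertex space for this chunk of boxes
 * under the vertex lock, then drop the lock while filling the reserved
 * slots so other threads can pack their own rectangles concurrently.
 */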
fastcall static void
gen4_render_composite_spans_boxes__thread(struct sna *sna,
					  const struct sna_composite_spans_op *op,
					  const struct sna_opacity_box *box,
					  int nbox)
{
	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
	     __FUNCTION__, nbox,
	     op->base.src.offset[0], op->base.src.offset[1],
	     op->base.dst.x, op->base.dst.y));
	assert(nbox);

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen4_get_rectangles(sna, &op->base, nbox,
						     gen4_bind_surfaces);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}

fastcall static void
gen4_render_composite_spans_done(struct sna *sna,
				 const struct sna_composite_spans_op *op)
{
	if (sna->render.vertex_offset)
		gen4_vertex_flush(sna);

	DBG(("%s()\n", __FUNCTION__));

	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
	sna_render_composite_redirect_done(sna, &op->base);
}

static bool
gen4_check_composite_spans(struct sna *sna,
			   uint8_t op, PicturePtr src, PicturePtr dst,
			   int16_t width, int16_t height,
			   unsigned flags)
{
	DBG(("%s: op=%d, width=%d, height=%d, flags=%x\n",
	     __FUNCTION__, op, width, height, flags));

	if (op >= ARRAY_SIZE(gen4_blend_op))
		return false;

	if (gen4_composite_fallback(sna, src, NULL, dst)) {
		DBG(("%s: operation would fallback\n", __FUNCTION__));
		return false;
	}

	if (need_tiling(sna, width, height) &&
	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
		DBG(("%s: fallback, tiled operation not on GPU\n",
		     __FUNCTION__));
		return false;
	}

	if (FORCE_SPANS)
		return FORCE_SPANS > 0;

	if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0) {
		struct sna_pixmap *priv;

		if (FORCE_NONRECTILINEAR_SPANS)
			return FORCE_NONRECTILINEAR_SPANS > 0;

		if ((sna->render.prefer_gpu & PREFER_GPU_SPANS) == 0)
			return false;

		priv = sna_pixmap_from_drawable(dst->pDrawable);
		assert(priv);

		if (priv->cpu_bo &&
		    __kgem_bo_is_busy(&sna->kgem, priv->cpu_bo))
			return true;

		if (flags & COMPOSITE_SPANS_INPLACE_HINT)
			return false;

		return priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo);
	}

	return true;
}

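/* Set up a spans operation proper: there is no mask channel, the
 * per-box opacity is instead handled by the chosen spans emitter and
 * the opacity kernel.
 */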
static bool
gen4_render_composite_spans(struct sna *sna,
			    uint8_t op,
			    PicturePtr src,
			    PicturePtr dst,
			    int16_t src_x,  int16_t src_y,
			    int16_t dst_x,  int16_t dst_y,
			    int16_t width,  int16_t height,
			    unsigned flags,
			    struct sna_composite_spans_op *tmp)
{
	DBG(("%s: %dx%d with flags=%x, current mode=%d\n", __FUNCTION__,
	     width, height, flags, sna->kgem.ring));

	assert(gen4_check_composite_spans(sna, op, src, dst, width, height, flags));

	if (need_tiling(sna, width, height)) {
		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
		     __FUNCTION__, width, height));
		return sna_tiling_composite_spans(op, src, dst,
						  src_x, src_y, dst_x, dst_y,
						  width, height, flags, tmp);
	}

	tmp->base.op = op;
	if (!gen4_composite_set_target(sna, &tmp->base, dst,
				       dst_x, dst_y, width, height, true))
		return false;

	switch (gen4_composite_picture(sna, src, &tmp->base.src,
				       src_x, src_y,
				       width, height,
				       dst_x, dst_y,
				       dst->polyMode == PolyModePrecise)) {
	case -1:
		goto cleanup_dst;
	case 0:
		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
			goto cleanup_dst;
		/* fall through to fixup */
	case 1:
		gen4_composite_channel_convert(&tmp->base.src);
		break;
	}

	tmp->base.mask.bo = NULL;
	tmp->base.mask.filter = SAMPLER_FILTER_NEAREST;
	tmp->base.mask.repeat = SAMPLER_EXTEND_NONE;

	tmp->base.is_affine = tmp->base.src.is_affine;
	tmp->base.has_component_alpha = false;
	tmp->base.need_magic_ca_pass = false;

	tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(sna, tmp);
	tmp->base.u.gen4.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;

	tmp->box   = gen4_render_composite_spans_box;
	tmp->boxes = gen4_render_composite_spans_boxes;
	if (tmp->emit_boxes)
		tmp->thread_boxes = gen4_render_composite_spans_boxes__thread;
	tmp->done  = gen4_render_composite_spans_done;

	if (!kgem_check_bo(&sna->kgem,
			   tmp->base.dst.bo, tmp->base.src.bo,
			   NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem,
				   tmp->base.dst.bo, tmp->base.src.bo,
				   NULL))
			goto cleanup_src;
	}

	gen4_align_vertex(sna, &tmp->base);
	gen4_bind_surfaces(sna, &tmp->base);
	return true;

cleanup_src:
	if (tmp->base.src.bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
cleanup_dst:
	if (tmp->base.redirect.real_bo)
		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
	return false;
}
#endif

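/* A minimal binding table for copies: just destination and source. If
 * it matches the table already in use, rewind the surface allocation
 * and reuse the previous offset to avoid re-emitting state.
 */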
static void
gen4_copy_bind_surfaces(struct sna *sna, const struct sna_composite_op *op)
{
	uint32_t *binding_table;
	uint16_t offset, dirty;

	gen4_get_batch(sna, op);
	dirty = kgem_bo_is_dirty(op->dst.bo);

	binding_table = gen4_composite_get_binding_table(sna, &offset);

	binding_table[0] =
		gen4_bind_bo(sna,
			     op->dst.bo, op->dst.width, op->dst.height,
			     gen4_get_dest_format(op->dst.format),
			     true);
	binding_table[1] =
		gen4_bind_bo(sna,
			     op->src.bo, op->src.width, op->src.height,
			     op->src.card_format,
			     false);

	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen4.surface_table) == *(uint64_t*)binding_table) {
		sna->kgem.surface += sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
		offset = sna->render_state.gen4.surface_table;
	}

	if (!ALWAYS_FLUSH && sna->kgem.batch[sna->render_state.gen4.surface_table] == binding_table[0])
		dirty = 0;

	gen4_emit_state(sna, op, offset | dirty);
}

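/* Each rectangle is emitted as just three vertices (the hardware
 * RECTLIST primitive infers the fourth corner), each carrying a screen
 * position and a normalised source texture coordinate.
 */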
static void
gen4_render_copy_one(struct sna *sna,
		     const struct sna_composite_op *op,
		     int sx, int sy,
		     int w, int h,
		     int dx, int dy)
{
	gen4_get_rectangles(sna, op, 1, gen4_copy_bind_surfaces);

	OUT_VERTEX(dx+w, dy+h);
	OUT_VERTEX_F((sx+w)*op->src.scale[0]);
	OUT_VERTEX_F((sy+h)*op->src.scale[1]);

	OUT_VERTEX(dx, dy+h);
	OUT_VERTEX_F(sx*op->src.scale[0]);
	OUT_VERTEX_F((sy+h)*op->src.scale[1]);

	OUT_VERTEX(dx, dy);
	OUT_VERTEX_F(sx*op->src.scale[0]);
	OUT_VERTEX_F(sy*op->src.scale[1]);
}

static bool
gen4_render_copy_boxes(struct sna *sna, uint8_t alu,
		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
		       const BoxRec *box, int n, unsigned flags)
{
	struct sna_composite_op tmp;

	DBG(("%s x %d\n", __FUNCTION__, n));

	if (sna_blt_compare_depth(src, dst) &&
	    sna_blt_copy_boxes(sna, alu,
			       src_bo, src_dx, src_dy,
			       dst_bo, dst_dx, dst_dy,
			       dst->bitsPerPixel,
			       box, n))
		return true;

	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo) {
fallback_blt:
		if (!sna_blt_compare_depth(src, dst))
			return false;

		return sna_blt_copy_boxes_fallback(sna, alu,
						   src, src_bo, src_dx, src_dy,
						   dst, dst_bo, dst_dx, dst_dy,
						   box, n);
	}

	memset(&tmp, 0, sizeof(tmp));

	DBG(("%s (%d, %d)->(%d, %d) x %d\n",
	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n));

	if (dst->depth == src->depth) {
		tmp.dst.format = sna_render_format_for_depth(dst->depth);
		tmp.src.pict_format = tmp.dst.format;
	} else {
		tmp.dst.format = sna_format_for_depth(dst->depth);
		tmp.src.pict_format = sna_format_for_depth(src->depth);
	}
	if (!gen4_check_format(tmp.src.pict_format))
		goto fallback_blt;

	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;

	tmp.dst.pixmap = (PixmapPtr)dst;
	tmp.dst.width  = dst->width;
	tmp.dst.height = dst->height;
	tmp.dst.x = tmp.dst.y = 0;
	tmp.dst.bo = dst_bo;
	tmp.damage = NULL;

	sna_render_composite_redirect_init(&tmp);
	if (too_large(tmp.dst.width, tmp.dst.height)) {
		BoxRec extents = box[0];
		int i;

		for (i = 1; i < n; i++) {
			if (box[i].x1 < extents.x1)
				extents.x1 = box[i].x1;
			if (box[i].y1 < extents.y1)
				extents.y1 = box[i].y1;

			if (box[i].x2 > extents.x2)
				extents.x2 = box[i].x2;
			if (box[i].y2 > extents.y2)
				extents.y2 = box[i].y2;
		}
		if (!sna_render_composite_redirect(sna, &tmp,
						   extents.x1 + dst_dx,
						   extents.y1 + dst_dy,
						   extents.x2 - extents.x1,
						   extents.y2 - extents.y1,
						   n > 1))
			goto fallback_tiled;
	}

	tmp.src.filter = SAMPLER_FILTER_NEAREST;
	tmp.src.repeat = SAMPLER_EXTEND_NONE;
	tmp.src.card_format = gen4_get_card_format(tmp.src.pict_format);
	if (too_large(src->width, src->height)) {
		BoxRec extents = box[0];
		int i;

		for (i = 1; i < n; i++) {
			if (box[i].x1 < extents.x1)
				extents.x1 = box[i].x1;
			if (box[i].y1 < extents.y1)
				extents.y1 = box[i].y1;

			if (box[i].x2 > extents.x2)
				extents.x2 = box[i].x2;
			if (box[i].y2 > extents.y2)
				extents.y2 = box[i].y2;
		}

		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
					       extents.x1 + src_dx,
					       extents.y1 + src_dy,
					       extents.x2 - extents.x1,
					       extents.y2 - extents.y1))
			goto fallback_tiled_dst;
	} else {
		tmp.src.bo = kgem_bo_reference(src_bo);
		tmp.src.width  = src->width;
		tmp.src.height = src->height;
		tmp.src.offset[0] = tmp.src.offset[1] = 0;
		tmp.src.scale[0] = 1.f/src->width;
		tmp.src.scale[1] = 1.f/src->height;
	}

	tmp.is_affine = true;
	tmp.floats_per_vertex = 3;
	tmp.floats_per_rect = 9;
	tmp.u.gen4.wm_kernel = WM_KERNEL;
	tmp.u.gen4.ve_id = 2;

	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
			if (tmp.redirect.real_bo)
				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);

			goto fallback_blt;
		}
	}

	dst_dx += tmp.dst.x;
	dst_dy += tmp.dst.y;
	tmp.dst.x = tmp.dst.y = 0;

	src_dx += tmp.src.offset[0];
	src_dy += tmp.src.offset[1];

	gen4_align_vertex(sna, &tmp);
	gen4_copy_bind_surfaces(sna, &tmp);

	do {
		gen4_render_copy_one(sna, &tmp,
				     box->x1 + src_dx, box->y1 + src_dy,
				     box->x2 - box->x1, box->y2 - box->y1,
				     box->x1 + dst_dx, box->y1 + dst_dy);
		box++;
	} while (--n);

	gen4_vertex_flush(sna);
	sna_render_composite_redirect_done(sna, &tmp);
	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
	return true;

fallback_tiled_dst:
	if (tmp.redirect.real_bo)
		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
fallback_tiled:
	if (sna_blt_compare_depth(src, dst) &&
	    sna_blt_copy_boxes(sna, alu,
			       src_bo, src_dx, src_dy,
			       dst_bo, dst_dx, dst_dy,
			       dst->bitsPerPixel,
			       box, n))
		return true;

	return sna_tiling_copy_boxes(sna, alu,
				     src, src_bo, src_dx, src_dy,
				     dst, dst_bo, dst_dx, dst_dy,
				     box, n);
}

static void
gen4_render_copy_blt(struct sna *sna,
		     const struct sna_copy_op *op,
		     int16_t sx, int16_t sy,
		     int16_t w,  int16_t h,
		     int16_t dx, int16_t dy)
{
	gen4_render_copy_one(sna, &op->base, sx, sy, w, h, dx, dy);
}

static void
gen4_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
{
	if (sna->render.vertex_offset)
		gen4_vertex_flush(sna);
}

static bool
gen4_render_copy(struct sna *sna, uint8_t alu,
		 PixmapPtr src, struct kgem_bo *src_bo,
		 PixmapPtr dst, struct kgem_bo *dst_bo,
		 struct sna_copy_op *op)
{
	DBG(("%s: src=%ld, dst=%ld, alu=%d\n",
	     __FUNCTION__,
	     src->drawable.serialNumber,
	     dst->drawable.serialNumber,
	     alu));

	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
	    sna_blt_copy(sna, alu,
			 src_bo, dst_bo,
			 dst->drawable.bitsPerPixel,
			 op))
		return true;

	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
	    too_large(src->drawable.width, src->drawable.height) ||
	    too_large(dst->drawable.width, dst->drawable.height)) {
fallback:
		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
			return false;

		return sna_blt_copy(sna, alu, src_bo, dst_bo,
				    dst->drawable.bitsPerPixel,
				    op);
	}

	if (dst->drawable.depth == src->drawable.depth) {
		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
		op->base.src.pict_format = op->base.dst.format;
	} else {
		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
	}
	if (!gen4_check_format(op->base.src.pict_format))
		goto fallback;

	op->base.op = alu == GXcopy ? PictOpSrc : PictOpClear;

	op->base.dst.pixmap = dst;
	op->base.dst.width  = dst->drawable.width;
	op->base.dst.height = dst->drawable.height;
	op->base.dst.bo = dst_bo;

	op->base.src.bo = src_bo;
	op->base.src.card_format =
		gen4_get_card_format(op->base.src.pict_format);
	op->base.src.width  = src->drawable.width;
	op->base.src.height = src->drawable.height;
	op->base.src.scale[0] = 1.f/src->drawable.width;
	op->base.src.scale[1] = 1.f/src->drawable.height;
	op->base.src.filter = SAMPLER_FILTER_NEAREST;
	op->base.src.repeat = SAMPLER_EXTEND_NONE;

	op->base.is_affine = true;
	op->base.floats_per_vertex = 3;
	op->base.floats_per_rect = 9;
	op->base.u.gen4.wm_kernel = WM_KERNEL;
	op->base.u.gen4.ve_id = 2;

	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
			goto fallback;
	}

	if (kgem_bo_is_dirty(src_bo)) {
		if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
		    sna_blt_copy(sna, alu,
				 src_bo, dst_bo,
				 dst->drawable.bitsPerPixel,
				 op))
			return true;
	}

	gen4_align_vertex(sna, &op->base);
	gen4_copy_bind_surfaces(sna, &op->base);

	op->blt  = gen4_render_copy_blt;
	op->done = gen4_render_copy_done;
	return true;
}

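/* Fill sources are solid colours, so every vertex carries the same
 * texture coordinate: .5 addresses the centre of the solid texel.
 */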
static void
gen4_render_fill_rectangle(struct sna *sna,
			   const struct sna_composite_op *op,
			   int x, int y, int w, int h)
{
	gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);

	OUT_VERTEX(x+w, y+h);
	OUT_VERTEX_F(.5);

	OUT_VERTEX(x, y+h);
	OUT_VERTEX_F(.5);

	OUT_VERTEX(x, y);
	OUT_VERTEX_F(.5);
}

static bool
gen4_render_fill_boxes(struct sna *sna,
		       CARD8 op,
		       PictFormat format,
		       const xRenderColor *color,
		       const DrawableRec *dst, struct kgem_bo *dst_bo,
		       const BoxRec *box, int n)
{
	struct sna_composite_op tmp;
	uint32_t pixel;

	if (op >= ARRAY_SIZE(gen4_blend_op)) {
		DBG(("%s: fallback due to unhandled blend op: %d\n",
		     __FUNCTION__, op));
		return false;
	}

	if (op <= PictOpSrc) {
		uint8_t alu = GXinvalid;

		pixel = 0;
		if (op == PictOpClear)
			alu = GXclear;
		else if (sna_get_pixel_from_rgba(&pixel,
						 color->red,
						 color->green,
						 color->blue,
						 color->alpha,
						 format))
			alu = GXcopy;

		if (alu != GXinvalid &&
		    sna_blt_fill_boxes(sna, alu,
				       dst_bo, dst->bitsPerPixel,
				       pixel, box, n))
			return true;

		if (!gen4_check_dst_format(format))
			return false;

		if (too_large(dst->width, dst->height))
			return sna_tiling_fill_boxes(sna, op, format, color,
						     dst, dst_bo, box, n);
	}

	if (op == PictOpClear) {
		pixel = 0;
		op = PictOpSrc;
	} else if (!sna_get_pixel_from_rgba(&pixel,
					    color->red,
					    color->green,
					    color->blue,
					    color->alpha,
					    PICT_a8r8g8b8))
		return false;

	DBG(("%s(%08x x %d)\n", __FUNCTION__, pixel, n));

	memset(&tmp, 0, sizeof(tmp));

	tmp.op = op;

	tmp.dst.pixmap = (PixmapPtr)dst;
	tmp.dst.width  = dst->width;
	tmp.dst.height = dst->height;
	tmp.dst.format = format;
	tmp.dst.bo = dst_bo;

	sna_render_composite_redirect_init(&tmp);
	if (too_large(dst->width, dst->height)) {
		BoxRec extents;

		boxes_extents(box, n, &extents);
		if (!sna_render_composite_redirect(sna, &tmp,
						   extents.x1, extents.y1,
						   extents.x2 - extents.x1,
						   extents.y2 - extents.y1,
						   n > 1))
			return sna_tiling_fill_boxes(sna, op, format, color,
						     dst, dst_bo, box, n);
	}

	gen4_channel_init_solid(sna, &tmp.src, pixel);

	tmp.is_affine = true;
	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.u.gen4.wm_kernel = WM_KERNEL;
	tmp.u.gen4.ve_id = 1;

	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
			return false;
		}
	}

	gen4_align_vertex(sna, &tmp);
	gen4_bind_surfaces(sna, &tmp);

	do {
		gen4_render_fill_rectangle(sna, &tmp,
					   box->x1, box->y1,
					   box->x2 - box->x1,
					   box->y2 - box->y1);
		box++;
	} while (--n);

	gen4_vertex_flush(sna);
	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
	sna_render_composite_redirect_done(sna, &tmp);
	return true;
}

static void
gen4_render_fill_op_blt(struct sna *sna, const struct sna_fill_op *op,
			int16_t x, int16_t y, int16_t w, int16_t h)
{
	gen4_render_fill_rectangle(sna, &op->base, x, y, w, h);
}

fastcall static void
gen4_render_fill_op_box(struct sna *sna,
			const struct sna_fill_op *op,
			const BoxRec *box)
{
	gen4_render_fill_rectangle(sna, &op->base,
				   box->x1, box->y1,
				   box->x2-box->x1, box->y2-box->y1);
}

fastcall static void
gen4_render_fill_op_boxes(struct sna *sna,
			  const struct sna_fill_op *op,
			  const BoxRec *box,
			  int nbox)
{
	do {
		gen4_render_fill_rectangle(sna, &op->base,
					   box->x1, box->y1,
					   box->x2-box->x1, box->y2-box->y1);
		box++;
	} while (--nbox);
}

static void
gen4_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
{
	if (sna->render.vertex_offset)
		gen4_vertex_flush(sna);
	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
}

static bool
gen4_render_fill(struct sna *sna, uint8_t alu,
		 PixmapPtr dst, struct kgem_bo *dst_bo,
		 uint32_t color, unsigned flags,
		 struct sna_fill_op *op)
{
	if (sna_blt_fill(sna, alu,
			 dst_bo, dst->drawable.bitsPerPixel,
			 color,
			 op))
		return true;

	if (!(alu == GXcopy || alu == GXclear) ||
	    too_large(dst->drawable.width, dst->drawable.height))
		return sna_blt_fill(sna, alu,
				    dst_bo, dst->drawable.bitsPerPixel,
				    color,
				    op);

	if (alu == GXclear)
		color = 0;

	op->base.op = color == 0 ? PictOpClear : PictOpSrc;

	op->base.dst.pixmap = dst;
	op->base.dst.width  = dst->drawable.width;
	op->base.dst.height = dst->drawable.height;
	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
	op->base.dst.bo = dst_bo;
	op->base.dst.x = op->base.dst.y = 0;

	op->base.need_magic_ca_pass = 0;
	op->base.has_component_alpha = 0;

	gen4_channel_init_solid(sna, &op->base.src,
				sna_rgba_for_color(color,
						   dst->drawable.depth));
	op->base.mask.bo = NULL;

	op->base.is_affine = true;
	op->base.floats_per_vertex = 2;
	op->base.floats_per_rect = 6;
	op->base.u.gen4.wm_kernel = WM_KERNEL;
	op->base.u.gen4.ve_id = 1;

	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, op->base.src.bo);
			return false;
		}
	}

	gen4_align_vertex(sna, &op->base);
	gen4_bind_surfaces(sna, &op->base);

	op->blt   = gen4_render_fill_op_blt;
	op->box   = gen4_render_fill_op_box;
	op->boxes = gen4_render_fill_op_boxes;
	op->points = NULL;
	op->done  = gen4_render_fill_op_done;
	return true;
}

static bool
gen4_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
			     uint32_t color,
			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
			     uint8_t alu)
{
	BoxRec box;

	box.x1 = x1;
	box.y1 = y1;
	box.x2 = x2;
	box.y2 = y2;

	return sna_blt_fill_boxes(sna, alu,
				  bo, dst->drawable.bitsPerPixel,
				  color, &box, 1);
}

static bool
gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
		     uint32_t color,
		     int16_t x1, int16_t y1,
		     int16_t x2, int16_t y2,
		     uint8_t alu)
{
	struct sna_composite_op tmp;

	DBG(("%s: color=%08x\n", __FUNCTION__, color));

	if (gen4_render_fill_one_try_blt(sna, dst, bo, color,
					 x1, y1, x2, y2, alu))
		return true;

	/* Must use the BLT if we can't RENDER... */
	if (!(alu == GXcopy || alu == GXclear) ||
	    too_large(dst->drawable.width, dst->drawable.height))
		return false;

	if (alu == GXclear)
		color = 0;

	tmp.op = color == 0 ? PictOpClear : PictOpSrc;

	tmp.dst.pixmap = dst;
	tmp.dst.width  = dst->drawable.width;
	tmp.dst.height = dst->drawable.height;
	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
	tmp.dst.bo = bo;
	tmp.dst.x = tmp.dst.y = 0;

	gen4_channel_init_solid(sna, &tmp.src,
				sna_rgba_for_color(color,
						   dst->drawable.depth));
	tmp.mask.bo = NULL;
	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
	tmp.mask.repeat = SAMPLER_EXTEND_NONE;

	tmp.is_affine = true;
	tmp.floats_per_vertex = 2;
	tmp.floats_per_rect = 6;
	tmp.has_component_alpha = false;
	tmp.need_magic_ca_pass = false;

	tmp.u.gen4.wm_kernel = WM_KERNEL;
	tmp.u.gen4.ve_id = 1;

	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
			return false;
		}
	}

	gen4_align_vertex(sna, &tmp);
	gen4_bind_surfaces(sna, &tmp);

	gen4_render_fill_rectangle(sna, &tmp, x1, y1, x2 - x1, y2 - y1);

	gen4_vertex_flush(sna);
	kgem_bo_destroy(&sna->kgem, tmp.src.bo);

	return true;
}

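/* Invalidate all cached hardware state when the batch is reset, so
 * the invariants, URB layout, pipelined pointers and drawing rectangle
 * are re-emitted before the next primitive.
 */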
static void gen4_render_reset(struct sna *sna)
{
	sna->render_state.gen4.needs_invariant = true;
	sna->render_state.gen4.needs_urb = true;
	sna->render_state.gen4.ve_id = -1;
	sna->render_state.gen4.last_primitive = -1;
	sna->render_state.gen4.last_pipelined_pointers = -1;

	sna->render_state.gen4.drawrect_offset = -1;
	sna->render_state.gen4.drawrect_limit = -1;
	sna->render_state.gen4.surface_table = 0;

	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
		discard_vbo(sna);
	}

	sna->render.vertex_offset = 0;
	sna->render.nvertex_reloc = 0;
	sna->render.vb_id = 0;
}

static void gen4_render_fini(struct sna *sna)
{
	kgem_bo_destroy(&sna->kgem, sna->render_state.gen4.general_bo);
}

static uint32_t gen4_create_vs_unit_state(struct sna_static_stream *stream)
{
	struct gen4_vs_unit_state *vs = sna_static_stream_map(stream, sizeof(*vs), 32);

	/* Set up the vertex shader to be disabled (passthrough) */
	vs->thread4.nr_urb_entries = URB_VS_ENTRIES;
	vs->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
	vs->vs6.vs_enable = 0;
	vs->vs6.vert_cache_disable = 1;

	return sna_static_stream_offsetof(stream, vs);
}

static uint32_t gen4_create_sf_state(struct sna_static_stream *stream,
				     uint32_t kernel)
{
	struct gen4_sf_unit_state *sf;

	sf = sna_static_stream_map(stream, sizeof(*sf), 32);

	sf->thread0.grf_reg_count = GEN4_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
	sf->thread0.kernel_start_pointer = kernel >> 6;
	sf->thread3.const_urb_entry_read_length = 0;	/* no const URBs */
	sf->thread3.const_urb_entry_read_offset = 0;	/* no const URBs */
	sf->thread3.urb_entry_read_length = 1;	/* 1 URB per vertex */
	/* don't smash vertex header, read start from dw8 */
	sf->thread3.urb_entry_read_offset = 1;
	sf->thread3.dispatch_grf_start_reg = 3;
	sf->thread4.max_threads = GEN4_MAX_SF_THREADS - 1;
	sf->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
	sf->thread4.nr_urb_entries = URB_SF_ENTRIES;
	sf->sf5.viewport_transform = false;	/* skip viewport */
	sf->sf6.cull_mode = GEN4_CULLMODE_NONE;
	sf->sf6.scissor = 0;
	sf->sf7.trifan_pv = 2;
	sf->sf6.dest_org_vbias = 0x8;
	sf->sf6.dest_org_hbias = 0x8;

	return sna_static_stream_offsetof(stream, sf);
}

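/* Sampler states are consumed in (source, mask) pairs, so write the
 * two entries adjacently; the WM unit state then points at the pair.
 */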
static uint32_t gen4_create_sampler_state(struct sna_static_stream *stream,
					  sampler_filter_t src_filter,
					  sampler_extend_t src_extend,
					  sampler_filter_t mask_filter,
					  sampler_extend_t mask_extend)
{
	struct gen4_sampler_state *sampler_state;

	sampler_state = sna_static_stream_map(stream,
					      sizeof(struct gen4_sampler_state) * 2,
					      32);
	sampler_state_init(&sampler_state[0], src_filter, src_extend);
	sampler_state_init(&sampler_state[1], mask_filter, mask_extend);

	return sna_static_stream_offsetof(stream, sampler_state);
}

static void gen4_init_wm_state(struct gen4_wm_unit_state *wm,
			       int gen,
			       bool has_mask,
			       uint32_t kernel,
			       uint32_t sampler)
{
	assert((kernel & 63) == 0);
	wm->thread0.kernel_start_pointer = kernel >> 6;
	wm->thread0.grf_reg_count = GEN4_GRF_BLOCKS(PS_KERNEL_NUM_GRF);

	wm->thread1.single_program_flow = 0;

	wm->thread3.const_urb_entry_read_length = 0;
	wm->thread3.const_urb_entry_read_offset = 0;

	wm->thread3.urb_entry_read_offset = 0;
	wm->thread3.dispatch_grf_start_reg = 3;

	assert((sampler & 31) == 0);
	wm->wm4.sampler_state_pointer = sampler >> 5;
	wm->wm4.sampler_count = 1;

	wm->wm5.max_threads = gen >= 045 ? G4X_MAX_WM_THREADS - 1 : GEN4_MAX_WM_THREADS - 1;
	wm->wm5.transposed_urb_read = 0;
	wm->wm5.thread_dispatch_enable = 1;
	/* just use 16-pixel dispatch (4 subspans), don't need to change kernel
	 * start point
	 */
	wm->wm5.enable_16_pix = 1;
	wm->wm5.enable_8_pix = 0;
	wm->wm5.early_depth_test = 1;

	/* Each pair of attributes (src/mask coords) is two URB entries */
	if (has_mask) {
		wm->thread1.binding_table_entry_count = 3;
		wm->thread3.urb_entry_read_length = 4;
	} else {
		wm->thread1.binding_table_entry_count = 2;
		wm->thread3.urb_entry_read_length = 2;
	}
}

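/* Build the full matrix of colour-calculator states, one 64-byte cell
 * for each (src, dst) blend-factor pair, so a blend state can later be
 * found by simple indexing.
 */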
static uint32_t gen4_create_cc_unit_state(struct sna_static_stream *stream)
{
	uint8_t *ptr, *base;
	int i, j;

	base = ptr =
		sna_static_stream_map(stream,
				      GEN4_BLENDFACTOR_COUNT*GEN4_BLENDFACTOR_COUNT*64,
				      64);

	for (i = 0; i < GEN4_BLENDFACTOR_COUNT; i++) {
		for (j = 0; j < GEN4_BLENDFACTOR_COUNT; j++) {
			struct gen4_cc_unit_state *state =
				(struct gen4_cc_unit_state *)ptr;

			state->cc3.blend_enable =
				!(j == GEN4_BLENDFACTOR_ZERO && i == GEN4_BLENDFACTOR_ONE);

			state->cc5.logicop_func = 0xc;	/* COPY */
			state->cc5.ia_blend_function = GEN4_BLENDFUNCTION_ADD;

			/* Fill in alpha blend factors same as color, for the future. */
			state->cc5.ia_src_blend_factor = i;
			state->cc5.ia_dest_blend_factor = j;

			state->cc6.blend_function = GEN4_BLENDFUNCTION_ADD;
			state->cc6.clamp_post_alpha_blend = 1;
			state->cc6.clamp_pre_alpha_blend = 1;
			state->cc6.src_blend_factor = i;
			state->cc6.dest_blend_factor = j;

			ptr += 64;
		}
	}

	return sna_static_stream_offsetof(stream, base);
}

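/* Assemble the static state buffer once at init: the null surface,
 * kernels, unit states and a table of WM states indexed by kernel and
 * by the source and mask filter/extend modes.
 */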
static bool gen4_render_setup(struct sna *sna)
{
	struct gen4_render_state *state = &sna->render_state.gen4;
	struct sna_static_stream general;
	struct gen4_wm_unit_state_padded *wm_state;
	uint32_t sf, wm[KERNEL_COUNT];
	int i, j, k, l, m;

	sna_static_stream_init(&general);

	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
	 * dumps, you know it points to zero.
	 */
	null_create(&general);

	sf = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__mask);
	for (m = 0; m < KERNEL_COUNT; m++) {
		if (wm_kernels[m].size) {
			wm[m] = sna_static_stream_add(&general,
						      wm_kernels[m].data,
						      wm_kernels[m].size,
						      64);
		} else {
			wm[m] = sna_static_stream_compile_wm(sna, &general,
							     wm_kernels[m].data,
							     16);
		}
	}

	state->vs = gen4_create_vs_unit_state(&general);
	state->sf = gen4_create_sf_state(&general, sf);

	wm_state = sna_static_stream_map(&general,
					  sizeof(*wm_state) * KERNEL_COUNT *
					  FILTER_COUNT * EXTEND_COUNT *
					  FILTER_COUNT * EXTEND_COUNT,
					  64);
	state->wm = sna_static_stream_offsetof(&general, wm_state);
	for (i = 0; i < FILTER_COUNT; i++) {
		for (j = 0; j < EXTEND_COUNT; j++) {
			for (k = 0; k < FILTER_COUNT; k++) {
				for (l = 0; l < EXTEND_COUNT; l++) {
					uint32_t sampler_state;

					sampler_state =
						gen4_create_sampler_state(&general,
									  i, j,
									  k, l);

					for (m = 0; m < KERNEL_COUNT; m++) {
						gen4_init_wm_state(&wm_state->state,
								   sna->kgem.gen,
								   wm_kernels[m].has_mask,
								   wm[m], sampler_state);
						wm_state++;
					}
				}
			}
		}
	}

	state->cc = gen4_create_cc_unit_state(&general);

	state->general_bo = sna_static_stream_fini(sna, &general);
	return state->general_bo != NULL;
}

const char *gen4_render_init(struct sna *sna, const char *backend)
{
	if (!gen4_render_setup(sna))
		return backend;

	sna->kgem.retire = gen4_render_retire;
	sna->kgem.expire = gen4_render_expire;

#if !NO_COMPOSITE
	sna->render.composite = gen4_render_composite;
	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
#endif
#if !NO_COMPOSITE_SPANS
	sna->render.check_composite_spans = gen4_check_composite_spans;
	sna->render.composite_spans = gen4_render_composite_spans;
	if (0)
		sna->render.prefer_gpu |= PREFER_GPU_SPANS;
#endif

#if !NO_VIDEO
	sna->render.video = gen4_render_video;
#endif

#if !NO_COPY_BOXES
	sna->render.copy_boxes = gen4_render_copy_boxes;
#endif
#if !NO_COPY
	sna->render.copy = gen4_render_copy;
#endif

#if !NO_FILL_BOXES
	sna->render.fill_boxes = gen4_render_fill_boxes;
#endif
#if !NO_FILL
	sna->render.fill = gen4_render_fill;
#endif
#if !NO_FILL_ONE
	sna->render.fill_one = gen4_render_fill_one;
#endif

	sna->render.flush = gen4_render_flush;
	sna->render.reset = gen4_render_reset;
	sna->render.fini = gen4_render_fini;

	sna->render.max_3d_size = GEN4_MAX_3D_SIZE;
	sna->render.max_3d_pitch = 1 << 18;
	return sna->kgem.gen >= 045 ? "Eaglelake (gen4.5)" : "Broadwater (gen4)";
}