gen5_render.c revision fe8aea9e
/*
 * Copyright © 2006,2008,2011 Intel Corporation
 * Copyright © 2007 Red Hat, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Wang Zhenyu <zhenyu.z.wang@sna.com>
 *    Eric Anholt <eric@anholt.net>
 *    Carl Worth <cworth@redhat.com>
 *    Keith Packard <keithp@keithp.com>
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "sna.h"
#include "sna_reg.h"
#include "sna_render.h"
#include "sna_render_inline.h"
#include "sna_video.h"

#include "brw/brw.h"
#include "gen5_render.h"
#include "gen4_common.h"
#include "gen4_source.h"
#include "gen4_vertex.h"

#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0

#define PREFER_BLT_FILL 1

#define DBG_NO_STATE_CACHE 0
#define DBG_NO_SURFACE_CACHE 0

#define ALWAYS_FLUSH 0

#define MAX_3D_SIZE 8192

#define GEN5_GRF_BLOCKS(nreg)    ((nreg + 15) / 16 - 1)

/* Set up a default static partitioning of the URB, which is supposed to
 * allow anything we would want to do, at potentially lower performance.
 */
#define URB_CS_ENTRY_SIZE     1
#define URB_CS_ENTRIES	      0

#define URB_VS_ENTRY_SIZE     1
#define URB_VS_ENTRIES	      256 /* minimum of 8 */

#define URB_GS_ENTRY_SIZE     0
#define URB_GS_ENTRIES	      0

#define URB_CLIP_ENTRY_SIZE   0
#define URB_CLIP_ENTRIES      0

#define URB_SF_ENTRY_SIZE     2
#define URB_SF_ENTRIES	      64

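/* Illustrative tally of the static partition above, in URB allocation
 * units, accumulated in the order VS, GS, CLIP, SF, CS exactly as
 * gen5_emit_urb() does below:
 *
 *    VS:   256 entries x 1 = units   0..256
 *    GS:     0 entries x 0           (fence stays at 256)
 *    CLIP:   0 entries x 0           (fence stays at 256)
 *    SF:    64 entries x 2 = units 256..384
 *    CS:     0 entries x 1           (fence stays at 384)
 *
 * Only the VS and SF sections are populated; GS and CLIP run in
 * passthrough/disabled mode in this driver.
 */
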
/*
 * This program computes dA/dx and dA/dy for the texture coordinates along
 * with the base texture coordinate. It was extracted from the Mesa driver.
 */

#define SF_KERNEL_NUM_GRF  16
#define SF_MAX_THREADS	   48

#define PS_KERNEL_NUM_GRF   32
#define PS_MAX_THREADS	    72

static const uint32_t ps_kernel_packed_bt601_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_argb.g5b"
#include "exa_wm_yuv_rgb_bt601.g5b"
#include "exa_wm_write.g5b"
};

static const uint32_t ps_kernel_planar_bt601_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_planar.g5b"
#include "exa_wm_yuv_rgb_bt601.g5b"
#include "exa_wm_write.g5b"
};

static const uint32_t ps_kernel_nv12_bt601_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_nv12.g5b"
#include "exa_wm_yuv_rgb_bt601.g5b"
#include "exa_wm_write.g5b"
};

static const uint32_t ps_kernel_packed_bt709_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_argb.g5b"
#include "exa_wm_yuv_rgb_bt709.g5b"
#include "exa_wm_write.g5b"
};

static const uint32_t ps_kernel_planar_bt709_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_planar.g5b"
#include "exa_wm_yuv_rgb_bt709.g5b"
#include "exa_wm_write.g5b"
};

static const uint32_t ps_kernel_nv12_bt709_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_nv12.g5b"
#include "exa_wm_yuv_rgb_bt709.g5b"
#include "exa_wm_write.g5b"
};

#define NOKERNEL(kernel_enum, func, masked) \
    [kernel_enum] = {func, 0, masked}
#define KERNEL(kernel_enum, kernel, masked) \
    [kernel_enum] = {&kernel, sizeof(kernel), masked}
static const struct wm_kernel_info {
	const void *data;
	unsigned int size;
	bool has_mask;
} wm_kernels[] = {
	NOKERNEL(WM_KERNEL, brw_wm_kernel__affine, false),
	NOKERNEL(WM_KERNEL_P, brw_wm_kernel__projective, false),

	NOKERNEL(WM_KERNEL_MASK, brw_wm_kernel__affine_mask, true),
	NOKERNEL(WM_KERNEL_MASK_P, brw_wm_kernel__projective_mask, true),

	NOKERNEL(WM_KERNEL_MASKCA, brw_wm_kernel__affine_mask_ca, true),
	NOKERNEL(WM_KERNEL_MASKCA_P, brw_wm_kernel__projective_mask_ca, true),

	NOKERNEL(WM_KERNEL_MASKSA, brw_wm_kernel__affine_mask_sa, true),
	NOKERNEL(WM_KERNEL_MASKSA_P, brw_wm_kernel__projective_mask_sa, true),

	NOKERNEL(WM_KERNEL_OPACITY, brw_wm_kernel__affine_opacity, true),
	NOKERNEL(WM_KERNEL_OPACITY_P, brw_wm_kernel__projective_opacity, true),

	KERNEL(WM_KERNEL_VIDEO_PLANAR_BT601, ps_kernel_planar_bt601_static, false),
	KERNEL(WM_KERNEL_VIDEO_NV12_BT601, ps_kernel_nv12_bt601_static, false),
	KERNEL(WM_KERNEL_VIDEO_PACKED_BT601, ps_kernel_packed_bt601_static, false),

	KERNEL(WM_KERNEL_VIDEO_PLANAR_BT709, ps_kernel_planar_bt709_static, false),
	KERNEL(WM_KERNEL_VIDEO_NV12_BT709, ps_kernel_nv12_bt709_static, false),
	KERNEL(WM_KERNEL_VIDEO_PACKED_BT709, ps_kernel_packed_bt709_static, false),
};
#undef KERNEL

static const struct blendinfo {
	bool src_alpha;
	uint32_t src_blend;
	uint32_t dst_blend;
} gen5_blend_op[] = {
	/* Clear */	{0, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_ZERO},
	/* Src */	{0, GEN5_BLENDFACTOR_ONE, GEN5_BLENDFACTOR_ZERO},
	/* Dst */	{0, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_ONE},
	/* Over */	{1, GEN5_BLENDFACTOR_ONE, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* OverReverse */ {0, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_ONE},
	/* In */	{0, GEN5_BLENDFACTOR_DST_ALPHA, GEN5_BLENDFACTOR_ZERO},
	/* InReverse */	{1, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_SRC_ALPHA},
	/* Out */	{0, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_ZERO},
	/* OutReverse */ {1, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* Atop */	{1, GEN5_BLENDFACTOR_DST_ALPHA, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* AtopReverse */ {1, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_SRC_ALPHA},
	/* Xor */	{1, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* Add */	{0, GEN5_BLENDFACTOR_ONE, GEN5_BLENDFACTOR_ONE},
};

/**
 * Highest-valued BLENDFACTOR used in gen5_blend_op.
 *
 * This leaves out GEN5_BLENDFACTOR_INV_DST_COLOR,
 * GEN5_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
 * GEN5_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
 */
#define GEN5_BLENDFACTOR_COUNT (GEN5_BLENDFACTOR_INV_DST_ALPHA + 1)

#define BLEND_OFFSET(s, d) \
	(((s) * GEN5_BLENDFACTOR_COUNT + (d)) * 64)

#define SAMPLER_OFFSET(sf, se, mf, me, k) \
	((((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) * KERNEL_COUNT + (k)) * 64)

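/* Illustrative note: both macros index pre-baked state (the offsets are
 * later added to render_state.gen5.cc and .wm in
 * gen5_emit_pipelined_pointers()), with every entry padded to 64 bytes.
 * BLEND_OFFSET walks a [src][dst] table of colour-calculator states;
 * e.g. PictOpOver (src=ONE, dst=INV_SRC_ALPHA) selects the entry at
 * (ONE * GEN5_BLENDFACTOR_COUNT + INV_SRC_ALPHA) * 64.  SAMPLER_OFFSET
 * does the same over the 5-tuple (src filter, src extend, mask filter,
 * mask extend, wm kernel), one pair of sampler states per combination.
 */
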
static bool
gen5_emit_pipelined_pointers(struct sna *sna,
			     const struct sna_composite_op *op,
			     int blend, int kernel);

#define OUT_BATCH(v) batch_emit(sna, v)
#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
#define OUT_VERTEX_F(v) vertex_emit(sna, v)

static inline bool too_large(int width, int height)
{
	return width > MAX_3D_SIZE || height > MAX_3D_SIZE;
}

static int
gen5_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
{
	int base;

	if (has_mask) {
		if (is_ca) {
			if (gen5_blend_op[op].src_alpha)
				base = WM_KERNEL_MASKSA;
			else
				base = WM_KERNEL_MASKCA;
		} else
			base = WM_KERNEL_MASK;
	} else
		base = WM_KERNEL;

	return base + !is_affine;
}

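/* Example: compositing PictOpOver with a component-alpha mask selects
 * WM_KERNEL_MASKSA, since Over has src_alpha set in gen5_blend_op;
 * without source alpha (e.g. PictOpAdd) it would be WM_KERNEL_MASKCA.
 * The projective variant of each kernel is always the affine enum + 1,
 * hence the "base + !is_affine" above.
 */
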
static bool gen5_magic_ca_pass(struct sna *sna,
			       const struct sna_composite_op *op)
{
	struct gen5_render_state *state = &sna->render_state.gen5;

	if (!op->need_magic_ca_pass)
		return false;

	assert(sna->render.vertex_index > sna->render.vertex_start);

	DBG(("%s: CA fixup\n", __FUNCTION__));
	assert(op->mask.bo != NULL);
	assert(op->has_component_alpha);

	gen5_emit_pipelined_pointers
		(sna, op, PictOpAdd,
		 gen5_choose_composite_kernel(PictOpAdd,
					      true, true, op->is_affine));

	OUT_BATCH(GEN5_3DPRIMITIVE |
		  GEN5_3DPRIMITIVE_VERTEX_SEQUENTIAL |
		  (_3DPRIM_RECTLIST << GEN5_3DPRIMITIVE_TOPOLOGY_SHIFT) |
		  (0 << 9) |
		  4);
	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
	OUT_BATCH(sna->render.vertex_start);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */

	state->last_primitive = sna->kgem.nbatch;
	return true;
}

static uint32_t gen5_get_blend(int op,
			       bool has_component_alpha,
			       uint32_t dst_format)
{
	uint32_t src, dst;

	src = gen5_blend_op[op].src_blend;
	dst = gen5_blend_op[op].dst_blend;

	/* If there's no dst alpha channel, adjust the blend op so that we'll treat
	 * it as always 1.
	 */
	if (PICT_FORMAT_A(dst_format) == 0) {
		if (src == GEN5_BLENDFACTOR_DST_ALPHA)
			src = GEN5_BLENDFACTOR_ONE;
		else if (src == GEN5_BLENDFACTOR_INV_DST_ALPHA)
			src = GEN5_BLENDFACTOR_ZERO;
	}

	/* If the source alpha is being used, then we should only be in a
	 * case where the source blend factor is 0, and the source blend
	 * value is the mask channels multiplied by the source picture's alpha.
	 */
	if (has_component_alpha && gen5_blend_op[op].src_alpha) {
		if (dst == GEN5_BLENDFACTOR_SRC_ALPHA)
			dst = GEN5_BLENDFACTOR_SRC_COLOR;
		else if (dst == GEN5_BLENDFACTOR_INV_SRC_ALPHA)
			dst = GEN5_BLENDFACTOR_INV_SRC_COLOR;
	}

	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
	     op, dst_format, PICT_FORMAT_A(dst_format),
	     src, dst, BLEND_OFFSET(src, dst)));
	return BLEND_OFFSET(src, dst);
}

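/* Worked example: PictOpOver onto x8r8g8b8 with a component-alpha mask.
 * The table gives src=ONE, dst=INV_SRC_ALPHA; the destination has no
 * alpha channel but src is not a DST_ALPHA factor so it is untouched,
 * while the component-alpha fixup turns dst into INV_SRC_COLOR - the
 * shader has already multiplied the per-channel mask into the source
 * value, so the "alpha" to invert lives in each colour channel.
 */
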
static uint32_t gen5_get_card_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
		return GEN5_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_x8r8g8b8:
		return GEN5_SURFACEFORMAT_B8G8R8X8_UNORM;
	case PICT_a8b8g8r8:
		return GEN5_SURFACEFORMAT_R8G8B8A8_UNORM;
	case PICT_x8b8g8r8:
		return GEN5_SURFACEFORMAT_R8G8B8X8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
		return GEN5_SURFACEFORMAT_B10G10R10A2_UNORM;
	case PICT_x2r10g10b10:
		return GEN5_SURFACEFORMAT_B10G10R10X2_UNORM;
#endif
	case PICT_r8g8b8:
		return GEN5_SURFACEFORMAT_R8G8B8_UNORM;
	case PICT_r5g6b5:
		return GEN5_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_a1r5g5b5:
		return GEN5_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN5_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
		return GEN5_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

static uint32_t gen5_get_dest_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
		return GEN5_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
		return GEN5_SURFACEFORMAT_R8G8B8A8_UNORM;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
		return GEN5_SURFACEFORMAT_B10G10R10A2_UNORM;
#endif
	case PICT_r5g6b5:
		return GEN5_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_x1r5g5b5:
	case PICT_a1r5g5b5:
		return GEN5_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN5_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
		return GEN5_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

static bool gen5_check_dst_format(PictFormat format)
{
	if (gen5_get_dest_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

static bool gen5_check_format(uint32_t format)
{
	if (gen5_get_card_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

typedef struct gen5_surface_state_padded {
	struct gen5_surface_state state;
	char pad[32 - sizeof(struct gen5_surface_state)];
} gen5_surface_state_padded;

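/* Note that gen5_get_dest_format() maps the x-variants to the
 * corresponding alpha-bearing render-target formats: writing alpha into
 * an undefined x-channel is assumed harmless, and the sampler never
 * reads those bits back as alpha.  The padding in the typedef above
 * keeps every surface state a fixed 32 bytes, so surface entries and
 * binding tables can be carved off with a constant stride.
 */
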
static void null_create(struct sna_static_stream *stream)
{
	/* A bunch of zeros useful for legacy border color and depth-stencil */
	sna_static_stream_map(stream, 64, 64);
}

static void
sampler_state_init(struct gen5_sampler_state *sampler_state,
		   sampler_filter_t filter,
		   sampler_extend_t extend)
{
	sampler_state->ss0.lod_preclamp = 1;	/* GL mode */

	/* We use the legacy mode to get the semantics specified by
	 * the Render extension. */
	sampler_state->ss0.border_color_mode = GEN5_BORDER_COLOR_MODE_LEGACY;

	switch (filter) {
	default:
	case SAMPLER_FILTER_NEAREST:
		sampler_state->ss0.min_filter = GEN5_MAPFILTER_NEAREST;
		sampler_state->ss0.mag_filter = GEN5_MAPFILTER_NEAREST;
		break;
	case SAMPLER_FILTER_BILINEAR:
		sampler_state->ss0.min_filter = GEN5_MAPFILTER_LINEAR;
		sampler_state->ss0.mag_filter = GEN5_MAPFILTER_LINEAR;
		break;
	}

	switch (extend) {
	default:
	case SAMPLER_EXTEND_NONE:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_CLAMP_BORDER;
		break;
	case SAMPLER_EXTEND_REPEAT:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_WRAP;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_WRAP;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_WRAP;
		break;
	case SAMPLER_EXTEND_PAD:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_CLAMP;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_CLAMP;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_CLAMP;
		break;
	case SAMPLER_EXTEND_REFLECT:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_MIRROR;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_MIRROR;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_MIRROR;
		break;
	}
}

static uint32_t gen5_filter(uint32_t filter)
{
	switch (filter) {
	default:
		assert(0);
	case PictFilterNearest:
		return SAMPLER_FILTER_NEAREST;
	case PictFilterBilinear:
		return SAMPLER_FILTER_BILINEAR;
	}
}

static uint32_t gen5_check_filter(PicturePtr picture)
{
	switch (picture->filter) {
	case PictFilterNearest:
	case PictFilterBilinear:
		return true;
	default:
		DBG(("%s: unknown filter: %x\n", __FUNCTION__, picture->filter));
		return false;
	}
}

static uint32_t gen5_repeat(uint32_t repeat)
{
	switch (repeat) {
	default:
		assert(0);
	case RepeatNone:
		return SAMPLER_EXTEND_NONE;
	case RepeatNormal:
		return SAMPLER_EXTEND_REPEAT;
	case RepeatPad:
		return SAMPLER_EXTEND_PAD;
	case RepeatReflect:
		return SAMPLER_EXTEND_REFLECT;
	}
}

static bool gen5_check_repeat(PicturePtr picture)
{
	if (!picture->repeat)
		return true;

	switch (picture->repeatType) {
	case RepeatNone:
	case RepeatNormal:
	case RepeatPad:
	case RepeatReflect:
		return true;
	default:
		DBG(("%s: unknown repeat: %x\n",
		     __FUNCTION__, picture->repeatType));
		return false;
	}
}

static uint32_t
gen5_tiling_bits(uint32_t tiling)
{
	switch (tiling) {
	default: assert(0);
	case I915_TILING_NONE: return 0;
	case I915_TILING_X: return GEN5_SURFACE_TILED;
	case I915_TILING_Y: return GEN5_SURFACE_TILED | GEN5_SURFACE_TILED_Y;
	}
}

/**
 * Sets up the common fields of a surface state entry for the given bo,
 * writing it into the surface area of the batch.
 */
static uint32_t
gen5_bind_bo(struct sna *sna,
	     struct kgem_bo *bo,
	     uint32_t width,
	     uint32_t height,
	     uint32_t format,
	     bool is_dst)
{
	uint32_t domains;
	uint16_t offset;
	uint32_t *ss;

	/* After the first bind, we manage the cache domains within the batch */
	if (!DBG_NO_SURFACE_CACHE) {
		offset = kgem_bo_get_binding(bo, format | is_dst << 31);
		if (offset) {
			if (is_dst)
				kgem_bo_mark_dirty(bo);
			assert(offset >= sna->kgem.surface);
			return offset * sizeof(uint32_t);
		}
	}

	offset = sna->kgem.surface -=
		sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);
	ss = sna->kgem.batch + offset;

	ss[0] = (GEN5_SURFACE_2D << GEN5_SURFACE_TYPE_SHIFT |
		 GEN5_SURFACE_BLEND_ENABLED |
		 format << GEN5_SURFACE_FORMAT_SHIFT);

	if (is_dst) {
		ss[0] |= GEN5_SURFACE_RC_READ_WRITE;
		domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER;
	} else
		domains = I915_GEM_DOMAIN_SAMPLER << 16;
	ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0);

	ss[2] = ((width - 1)  << GEN5_SURFACE_WIDTH_SHIFT |
		 (height - 1) << GEN5_SURFACE_HEIGHT_SHIFT);
	ss[3] = (gen5_tiling_bits(bo->tiling) |
		 (bo->pitch - 1) << GEN5_SURFACE_PITCH_SHIFT);
	ss[4] = 0;
	ss[5] = 0;

	kgem_bo_set_binding(bo, format | is_dst << 31, offset);

	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
	     offset, bo->handle, ss[1],
	     format, width, height, bo->pitch, bo->tiling,
	     domains & 0xffff ? "render" : "sampler"));

	return offset * sizeof(uint32_t);
}

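/* The binding cache above keys each surface state on (format, is_dst)
 * via kgem_bo_get_binding()/kgem_bo_set_binding(): binding the same bo
 * first as a sampler source and then as the render target yields two
 * distinct entries, since the key is the 32-bit value
 * format | is_dst << 31.  Width, height and pitch are assumed not to
 * change for a given bo within one batch.
 */
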
static void gen5_emit_vertex_buffer(struct sna *sna,
				    const struct sna_composite_op *op)
{
	int id = op->u.gen5.ve_id;

	assert((sna->render.vb_id & (1 << id)) == 0);

	OUT_BATCH(GEN5_3DSTATE_VERTEX_BUFFERS | 3);
	OUT_BATCH(id << VB0_BUFFER_INDEX_SHIFT | VB0_VERTEXDATA |
		  (4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
	assert(sna->render.nvertex_reloc < ARRAY_SIZE(sna->render.vertex_reloc));
	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
	OUT_BATCH(0);
	OUT_BATCH(~0); /* max address: disabled */
	OUT_BATCH(0);

	sna->render.vb_id |= 1 << id;
}

static void gen5_emit_primitive(struct sna *sna)
{
	if (sna->kgem.nbatch == sna->render_state.gen5.last_primitive) {
		sna->render.vertex_offset = sna->kgem.nbatch - 5;
		return;
	}

	OUT_BATCH(GEN5_3DPRIMITIVE |
		  GEN5_3DPRIMITIVE_VERTEX_SEQUENTIAL |
		  (_3DPRIM_RECTLIST << GEN5_3DPRIMITIVE_TOPOLOGY_SHIFT) |
		  (0 << 9) |
		  4);
	sna->render.vertex_offset = sna->kgem.nbatch;
	OUT_BATCH(0);	/* vertex count, to be filled in later */
	OUT_BATCH(sna->render.vertex_index);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */
	sna->render.vertex_start = sna->render.vertex_index;

	sna->render_state.gen5.last_primitive = sna->kgem.nbatch;
}

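/* The last_primitive check lets consecutive rectangles coalesce into a
 * single 3DPRIMITIVE: if nothing else has been emitted since the
 * previous primitive, vertex_offset is pointed back at its vertex-count
 * dword (5 dwords back from the end of the 6-dword packet) and the
 * count is simply patched when the vertices are flushed, instead of
 * starting a new packet.
 */
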
static bool gen5_rectangle_begin(struct sna *sna,
				 const struct sna_composite_op *op)
{
	int id = op->u.gen5.ve_id;
	int ndwords;

	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
		return true;

	ndwords = op->need_magic_ca_pass ? 20 : 6;
	if ((sna->render.vb_id & (1 << id)) == 0)
		ndwords += 5;

	if (!kgem_check_batch(&sna->kgem, ndwords))
		return false;

	if ((sna->render.vb_id & (1 << id)) == 0)
		gen5_emit_vertex_buffer(sna, op);
	if (sna->render.vertex_offset == 0)
		gen5_emit_primitive(sna);

	return true;
}

static int gen5_get_rectangles__flush(struct sna *sna,
				      const struct sna_composite_op *op)
{
	/* Prevent the new vbo from being discarded after lock contention */
	if (sna_vertex_wait__locked(&sna->render)) {
		int rem = vertex_space(sna);
		if (rem > op->floats_per_rect)
			return rem;
	}

	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 40 : 6))
		return 0;
	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
		return 0;

	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		if (gen5_magic_ca_pass(sna, op))
			gen5_emit_pipelined_pointers(sna, op, op->op,
						     op->u.gen5.wm_kernel);
	}

	return gen4_vertex_finish(sna);
}

inline static int gen5_get_rectangles(struct sna *sna,
				      const struct sna_composite_op *op,
				      int want,
				      void (*emit_state)(struct sna *sna,
							 const struct sna_composite_op *op))
{
	int rem;

	assert(want);

start:
	rem = vertex_space(sna);
	if (unlikely(rem < op->floats_per_rect)) {
		DBG(("flushing vbo for %s: %d < %d\n",
		     __FUNCTION__, rem, op->floats_per_rect));
		rem = gen5_get_rectangles__flush(sna, op);
		if (unlikely (rem == 0))
			goto flush;
	}

	if (unlikely(sna->render.vertex_offset == 0)) {
		if (!gen5_rectangle_begin(sna, op))
			goto flush;
		else
			goto start;
	}

	assert(rem <= vertex_space(sna));
	assert(op->floats_per_rect <= rem);
	if (want > 1 && want * op->floats_per_rect > rem)
		want = rem / op->floats_per_rect;

	sna->render.vertex_index += 3*want;
	return want;

flush:
	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		gen5_magic_ca_pass(sna, op);
	}
	sna_vertex_wait__locked(&sna->render);
	_kgem_submit(&sna->kgem);
	emit_state(sna, op);
	goto start;
}

static uint32_t *
gen5_composite_get_binding_table(struct sna *sna,
				 uint16_t *offset)
{
	sna->kgem.surface -=
		sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);

	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));

	/* Clear all surplus entries to zero in case of prefetch */
	*offset = sna->kgem.surface;
	return memset(sna->kgem.batch + sna->kgem.surface,
		      0, sizeof(struct gen5_surface_state_padded));
}

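/* Roughly speaking, surface states and binding tables are carved from
 * the top of the batch buffer and grow downwards (kgem.surface is a
 * dword index), while commands fill upwards from the bottom; the
 * kgem_check_batch*() calls above guard against the two meeting.  A
 * binding table here is just one 32-byte padded slot, giving up to
 * eight dword entries - composite uses two or three (dst, src, mask),
 * the video path up to seven (dst plus six source planes).
 */
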
static void
gen5_emit_urb(struct sna *sna)
{
	int urb_vs_start, urb_vs_size;
	int urb_gs_start, urb_gs_size;
	int urb_clip_start, urb_clip_size;
	int urb_sf_start, urb_sf_size;
	int urb_cs_start, urb_cs_size;

	urb_vs_start = 0;
	urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
	urb_gs_start = urb_vs_start + urb_vs_size;
	urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
	urb_clip_start = urb_gs_start + urb_gs_size;
	urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
	urb_sf_start = urb_clip_start + urb_clip_size;
	urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
	urb_cs_start = urb_sf_start + urb_sf_size;
	urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;

	OUT_BATCH(GEN5_URB_FENCE |
		  UF0_CS_REALLOC |
		  UF0_SF_REALLOC |
		  UF0_CLIP_REALLOC |
		  UF0_GS_REALLOC |
		  UF0_VS_REALLOC |
		  1);
	OUT_BATCH(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
		  ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
		  ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
	OUT_BATCH(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
		  ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));

	/* Constant buffer state */
	OUT_BATCH(GEN5_CS_URB_STATE | 0);
	OUT_BATCH((URB_CS_ENTRY_SIZE - 1) << 4 | URB_CS_ENTRIES << 0);
}

static void
gen5_emit_state_base_address(struct sna *sna)
{
	assert(sna->render_state.gen5.general_bo->proxy == NULL);
	OUT_BATCH(GEN5_STATE_BASE_ADDRESS | 6);
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* general */
				 sna->kgem.nbatch,
				 sna->render_state.gen5.general_bo,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* surface */
				 sna->kgem.nbatch,
				 NULL,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));
	OUT_BATCH(0); /* media */
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* instruction */
				 sna->kgem.nbatch,
				 sna->render_state.gen5.general_bo,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));

	/* upper bounds, all disabled */
	OUT_BATCH(BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(BASE_ADDRESS_MODIFY);
}

static void
gen5_emit_invariant(struct sna *sna)
{
	/* Ironlake errata workaround: Before disabling the clipper,
	 * you have to MI_FLUSH to get the pipeline idle.
	 *
	 * However, the kernel flushes the pipeline between batches,
	 * so we should be safe....
	 *
	 * On the other hand, after using BLT we must use a non-pipelined
	 * operation...
	 */
	if (sna->kgem.nreloc)
		OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);

	OUT_BATCH(GEN5_PIPELINE_SELECT | PIPELINE_SELECT_3D);

	gen5_emit_state_base_address(sna);

	sna->render_state.gen5.needs_invariant = false;
}

static void
gen5_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);

	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 4)) {
		DBG(("%s: flushing batch: %d < %d+%d\n",
		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
		     150, 4*8));
		kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	if (sna->render_state.gen5.needs_invariant)
		gen5_emit_invariant(sna);
}

static void
gen5_align_vertex(struct sna *sna, const struct sna_composite_op *op)
{
	assert(op->floats_per_rect == 3*op->floats_per_vertex);
	if (op->floats_per_vertex != sna->render_state.gen5.floats_per_vertex) {
		DBG(("aligning vertex: was %d, now %d floats per vertex\n",
		     sna->render_state.gen5.floats_per_vertex,
		     op->floats_per_vertex));
		gen4_vertex_align(sna, op);
		sna->render_state.gen5.floats_per_vertex = op->floats_per_vertex;
	}
}

static void
gen5_emit_binding_table(struct sna *sna, uint16_t offset)
{
	if (!DBG_NO_STATE_CACHE &&
	    sna->render_state.gen5.surface_table == offset)
		return;

	sna->render_state.gen5.surface_table = offset;

	/* Binding table pointers */
	OUT_BATCH(GEN5_3DSTATE_BINDING_TABLE_POINTERS | 4);
	OUT_BATCH(0);		/* vs */
	OUT_BATCH(0);		/* gs */
	OUT_BATCH(0);		/* clip */
	OUT_BATCH(0);		/* sf */
	/* Only the PS uses the binding table */
	OUT_BATCH(offset*4);
}

static bool
gen5_emit_pipelined_pointers(struct sna *sna,
			     const struct sna_composite_op *op,
			     int blend, int kernel)
{
	uint16_t sp, bp;
	uint32_t key;

	DBG(("%s: has_mask=%d, src=(%d, %d), mask=(%d, %d), kernel=%d, blend=%d, ca=%d, format=%x\n",
	     __FUNCTION__, op->u.gen5.ve_id & 2,
	     op->src.filter, op->src.repeat,
	     op->mask.filter, op->mask.repeat,
	     kernel, blend, op->has_component_alpha, (int)op->dst.format));

	sp = SAMPLER_OFFSET(op->src.filter, op->src.repeat,
			    op->mask.filter, op->mask.repeat,
			    kernel);
	bp = gen5_get_blend(blend, op->has_component_alpha, op->dst.format);

	key = sp | (uint32_t)bp << 16 | (op->mask.bo != NULL) << 31;
	DBG(("%s: sp=%d, bp=%d, key=%08x (current sp=%d, bp=%d, key=%08x)\n",
	     __FUNCTION__, sp, bp, key,
	     sna->render_state.gen5.last_pipelined_pointers & 0xffff,
	     (sna->render_state.gen5.last_pipelined_pointers >> 16) & 0x7fff,
	     sna->render_state.gen5.last_pipelined_pointers));
	if (key == sna->render_state.gen5.last_pipelined_pointers)
		return false;

	OUT_BATCH(GEN5_3DSTATE_PIPELINED_POINTERS | 5);
	OUT_BATCH(sna->render_state.gen5.vs);
	OUT_BATCH(GEN5_GS_DISABLE); /* passthrough */
	OUT_BATCH(GEN5_CLIP_DISABLE); /* passthrough */
	OUT_BATCH(sna->render_state.gen5.sf[op->mask.bo != NULL]);
	OUT_BATCH(sna->render_state.gen5.wm + sp);
	OUT_BATCH(sna->render_state.gen5.cc + bp);

	bp = (sna->render_state.gen5.last_pipelined_pointers & 0x7fff0000) != ((uint32_t)bp << 16);
	sna->render_state.gen5.last_pipelined_pointers = key;

	gen5_emit_urb(sna);

	return bp;
}

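/* The cache key packs the sampler offset into bits 0-15, the blend
 * offset into bits 16-30 and whether a mask is bound into bit 31, so
 * re-emission is skipped entirely when nothing changed.  The return
 * value is true only when the blend state itself changed; that is what
 * gen5_emit_state() uses to decide whether a pipeline flush is needed
 * mid-batch.
 */
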
static bool
gen5_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
{
	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;

	assert(!too_large(abs(op->dst.x), abs(op->dst.y)));
	assert(!too_large(op->dst.width, op->dst.height));

	if (!DBG_NO_STATE_CACHE &&
	    sna->render_state.gen5.drawrect_limit == limit &&
	    sna->render_state.gen5.drawrect_offset == offset)
		return false;

	sna->render_state.gen5.drawrect_offset = offset;
	sna->render_state.gen5.drawrect_limit = limit;

	OUT_BATCH(GEN5_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
	OUT_BATCH(0x00000000);
	OUT_BATCH(limit);
	OUT_BATCH(offset);
	return true;
}

static void
gen5_emit_vertex_elements(struct sna *sna,
			  const struct sna_composite_op *op)
{
	/*
	 * vertex data in vertex buffer
	 *    position: (x, y)
	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
	 *    texture coordinate 1 if (has_mask is true): same as above
	 */
	struct gen5_render_state *render = &sna->render_state.gen5;
	int id = op->u.gen5.ve_id;
	bool has_mask = id >> 2;
	uint32_t format, dw;

	if (!DBG_NO_STATE_CACHE && render->ve_id == id)
		return;

	DBG(("%s: changing %d -> %d\n", __FUNCTION__, render->ve_id, id));
	render->ve_id = id;

	/* The VUE layout
	 *    dword 0-3: pad (0.0, 0.0, 0.0, 0.0)
	 *    dword 4-7: position (x, y, 1.0, 1.0),
	 *    dword 8-11: texture coordinate 0 (u0, v0, w0, 1.0)
	 *    dword 12-15: texture coordinate 1 (u1, v1, w1, 1.0)
	 *
	 * dword 4-15 are fetched from vertex buffer
	 */
	OUT_BATCH(GEN5_3DSTATE_VERTEX_ELEMENTS |
		((2 * (has_mask ? 4 : 3)) + 1 - 2));

	OUT_BATCH((id << VE0_VERTEX_BUFFER_INDEX_SHIFT) | VE0_VALID |
		  (GEN5_SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT) |
		  (0 << VE0_OFFSET_SHIFT));
	OUT_BATCH((VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT) |
		  (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT) |
		  (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT) |
		  (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT));

	/* x,y */
	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
		  GEN5_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
		  0 << VE0_OFFSET_SHIFT);
	OUT_BATCH(VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
		  VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);

	/* u0, v0, w0 */
	DBG(("%s: id=%d, first channel %d floats, offset=4b\n", __FUNCTION__,
	     id, id & 3));
	dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
	switch (id & 3) {
	default:
		assert(0);
	case 0:
		format = GEN5_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 1:
		format = GEN5_SURFACEFORMAT_R32_FLOAT << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 2:
		format = GEN5_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 3:
		format = GEN5_SURFACEFORMAT_R32G32B32_FLOAT << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
		break;
	}
	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
		  format | 4 << VE0_OFFSET_SHIFT);
	OUT_BATCH(dw);

	/* u1, v1, w1 */
	if (has_mask) {
		unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
		DBG(("%s: id=%x, second channel %d floats, offset=%db\n", __FUNCTION__,
		     id, id >> 2, offset));
		dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
		switch (id >> 2) {
		case 1:
			format = GEN5_SURFACEFORMAT_R32_FLOAT << VE0_FORMAT_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
			break;
		default:
			assert(0);
		case 2:
			format = GEN5_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
			break;
		case 3:
			format = GEN5_SURFACEFORMAT_R32G32B32_FLOAT << VE0_FORMAT_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
			break;
		}
		OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
			  format | offset << VE0_OFFSET_SHIFT);
		OUT_BATCH(dw);
	}
}

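/* ve_id thus encodes the vertex layout compactly: bits 0-1 give the
 * number of floats in the source texcoord (0 selecting the packed
 * 16-bit sscaled pair) and bits 2+ the number in the mask texcoord,
 * zero meaning no mask.  For instance an affine source with an affine
 * mask is id 0xa: two floats each, i.e. a vertex of x,y,u0,v0,u1,v1.
 */
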
inline static void
gen5_emit_pipe_flush(struct sna *sna)
{
#if 1
	OUT_BATCH(GEN5_PIPE_CONTROL |
		  GEN5_PIPE_CONTROL_WC_FLUSH |
		  (4 - 2));
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
#else
	OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
#endif
}

static void
gen5_emit_state(struct sna *sna,
		const struct sna_composite_op *op,
		uint16_t offset)
{
	bool flush = false;

	assert(op->dst.bo->exec);

	/* drawrect must be first for Ironlake BLT workaround */
	if (gen5_emit_drawing_rectangle(sna, op))
		offset &= ~1;
	gen5_emit_binding_table(sna, offset & ~1);
	if (gen5_emit_pipelined_pointers(sna, op, op->op, op->u.gen5.wm_kernel)) {
		DBG(("%s: changed blend state, flush required? %d\n",
		     __FUNCTION__, (offset & 1) && op->op > PictOpSrc));
		flush = (offset & 1) && op->op > PictOpSrc;
	}
	gen5_emit_vertex_elements(sna, op);

	if (ALWAYS_FLUSH || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
		DBG(("%s: flushing dirty (%d, %d)\n", __FUNCTION__,
		     kgem_bo_is_dirty(op->src.bo),
		     kgem_bo_is_dirty(op->mask.bo)));
		OUT_BATCH(MI_FLUSH);
		kgem_clear_dirty(&sna->kgem);
		kgem_bo_mark_dirty(op->dst.bo);
		flush = false;
	}
	if (flush) {
		DBG(("%s: forcing flush\n", __FUNCTION__));
		gen5_emit_pipe_flush(sna);
	}
}

static void gen5_bind_surfaces(struct sna *sna,
			       const struct sna_composite_op *op)
{
	bool dirty = kgem_bo_is_dirty(op->dst.bo);
	uint32_t *binding_table;
	uint16_t offset;

	gen5_get_batch(sna, op);

	binding_table = gen5_composite_get_binding_table(sna, &offset);

	binding_table[0] =
		gen5_bind_bo(sna,
			    op->dst.bo, op->dst.width, op->dst.height,
			    gen5_get_dest_format(op->dst.format),
			    true);
	binding_table[1] =
		gen5_bind_bo(sna,
			     op->src.bo, op->src.width, op->src.height,
			     op->src.card_format,
			     false);
	if (op->mask.bo) {
		assert(op->u.gen5.ve_id >> 2);
		binding_table[2] =
			gen5_bind_bo(sna,
				     op->mask.bo,
				     op->mask.width,
				     op->mask.height,
				     op->mask.card_format,
				     false);
	}

	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen5.surface_table) == *(uint64_t*)binding_table &&
	    (op->mask.bo == NULL ||
	     sna->kgem.batch[sna->render_state.gen5.surface_table+2] == binding_table[2])) {
		sna->kgem.surface += sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);
		offset = sna->render_state.gen5.surface_table;
	}

	gen5_emit_state(sna, op, offset | dirty);
}

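/* If the freshly written binding table turns out to be identical to the
 * previous one (same dst/src entries, and the same mask entry if any),
 * the surface space is handed back and the old table offset reused, so
 * the 3DSTATE_BINDING_TABLE_POINTERS packet is then skipped by the
 * cache check in gen5_emit_binding_table().  The low bit smuggled into
 * "offset" carries the dst-dirty flag into gen5_emit_state().
 */
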
fastcall static void
gen5_render_composite_blt(struct sna *sna,
			  const struct sna_composite_op *op,
			  const struct sna_composite_rectangles *r)
{
	DBG(("%s: src=(%d, %d)+(%d, %d), mask=(%d, %d)+(%d, %d), dst=(%d, %d)+(%d, %d), size=(%d, %d)\n",
	     __FUNCTION__,
	     r->src.x, r->src.y, op->src.offset[0], op->src.offset[1],
	     r->mask.x, r->mask.y, op->mask.offset[0], op->mask.offset[1],
	     r->dst.x, r->dst.y, op->dst.x, op->dst.y,
	     r->width, r->height));

	gen5_get_rectangles(sna, op, 1, gen5_bind_surfaces);
	op->prim_emit(sna, op, r);
}

fastcall static void
gen5_render_composite_box(struct sna *sna,
			  const struct sna_composite_op *op,
			  const BoxRec *box)
{
	struct sna_composite_rectangles r;

	DBG(("  %s: (%d, %d), (%d, %d)\n",
	     __FUNCTION__,
	     box->x1, box->y1, box->x2, box->y2));

	gen5_get_rectangles(sna, op, 1, gen5_bind_surfaces);

	r.dst.x = box->x1;
	r.dst.y = box->y1;
	r.width  = box->x2 - box->x1;
	r.height = box->y2 - box->y1;
	r.mask = r.src = r.dst;

	op->prim_emit(sna, op, &r);
}

static void
gen5_render_composite_boxes__blt(struct sna *sna,
				 const struct sna_composite_op *op,
				 const BoxRec *box, int nbox)
{
	DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
	     __FUNCTION__, nbox, op->dst.x, op->dst.y,
	     op->src.offset[0], op->src.offset[1],
	     op->src.width, op->src.height,
	     op->mask.offset[0], op->mask.offset[1],
	     op->mask.width, op->mask.height));

	do {
		int nbox_this_time;

		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
						     gen5_bind_surfaces);
		nbox -= nbox_this_time;

		do {
			struct sna_composite_rectangles r;

			DBG(("  %s: (%d, %d), (%d, %d)\n",
			     __FUNCTION__,
			     box->x1, box->y1, box->x2, box->y2));

			r.dst.x = box->x1;
			r.dst.y = box->y1;
			r.width  = box->x2 - box->x1;
			r.height = box->y2 - box->y1;
			r.mask = r.src = r.dst;
			op->prim_emit(sna, op, &r);
			box++;
		} while (--nbox_this_time);
	} while (nbox);
}

static void
gen5_render_composite_boxes(struct sna *sna,
			    const struct sna_composite_op *op,
			    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
						     gen5_bind_surfaces);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;
	} while (nbox);
}

static void
gen5_render_composite_boxes__thread(struct sna *sna,
				    const struct sna_composite_op *op,
				    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
						     gen5_bind_surfaces);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}

#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

static uint32_t gen5_bind_video_source(struct sna *sna,
				       struct kgem_bo *src_bo,
				       uint32_t src_offset,
				       int src_width,
				       int src_height,
				       int src_pitch,
				       uint32_t src_surf_format)
{
	struct gen5_surface_state *ss;

	sna->kgem.surface -= sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);

	ss = memset(sna->kgem.batch + sna->kgem.surface, 0, sizeof(*ss));
	ss->ss0.surface_type = GEN5_SURFACE_2D;
	ss->ss0.surface_format = src_surf_format;
	ss->ss0.color_blend = 1;

	ss->ss1.base_addr =
		kgem_add_reloc(&sna->kgem,
			       sna->kgem.surface + 1,
			       src_bo,
			       I915_GEM_DOMAIN_SAMPLER << 16,
			       src_offset);

	ss->ss2.width  = src_width - 1;
	ss->ss2.height = src_height - 1;
	ss->ss3.pitch  = src_pitch - 1;

	return sna->kgem.surface * sizeof(uint32_t);
}

static void gen5_video_bind_surfaces(struct sna *sna,
				     const struct sna_composite_op *op)
{
	bool dirty = kgem_bo_is_dirty(op->dst.bo);
	struct sna_video_frame *frame = op->priv;
	uint32_t src_surf_format[6];
	uint32_t src_surf_base[6];
	int src_width[6];
	int src_height[6];
	int src_pitch[6];
	uint32_t *binding_table;
	uint16_t offset;
	int n_src, n;

	src_surf_base[0] = 0;
	src_surf_base[1] = 0;
	src_surf_base[2] = frame->VBufOffset;
	src_surf_base[3] = frame->VBufOffset;
	src_surf_base[4] = frame->UBufOffset;
	src_surf_base[5] = frame->UBufOffset;

	if (is_planar_fourcc(frame->id)) {
		for (n = 0; n < 2; n++) {
			src_surf_format[n] = GEN5_SURFACEFORMAT_R8_UNORM;
			src_width[n]  = frame->width;
			src_height[n] = frame->height;
			src_pitch[n]  = frame->pitch[1];
		}
		for (; n < 6; n++) {
			if (is_nv12_fourcc(frame->id))
				src_surf_format[n] = GEN5_SURFACEFORMAT_R8G8_UNORM;
			else
				src_surf_format[n] = GEN5_SURFACEFORMAT_R8_UNORM;
			src_width[n]  = frame->width / 2;
			src_height[n] = frame->height / 2;
			src_pitch[n]  = frame->pitch[0];
		}
		n_src = 6;
	} else {
		if (frame->id == FOURCC_UYVY)
			src_surf_format[0] = GEN5_SURFACEFORMAT_YCRCB_SWAPY;
		else
			src_surf_format[0] = GEN5_SURFACEFORMAT_YCRCB_NORMAL;

		src_width[0]  = frame->width;
		src_height[0] = frame->height;
		src_pitch[0]  = frame->pitch[0];
		n_src = 1;
	}

	gen5_get_batch(sna, op);

	binding_table = gen5_composite_get_binding_table(sna, &offset);
	binding_table[0] =
		gen5_bind_bo(sna,
			     op->dst.bo, op->dst.width, op->dst.height,
			     gen5_get_dest_format(op->dst.format),
			     true);
	for (n = 0; n < n_src; n++) {
		binding_table[1+n] =
			gen5_bind_video_source(sna,
					       frame->bo,
					       src_surf_base[n],
					       src_width[n],
					       src_height[n],
					       src_pitch[n],
					       src_surf_format[n]);
	}

	gen5_emit_state(sna, op, offset | dirty);
}

static unsigned select_video_kernel(const struct sna_video *video,
				    const struct sna_video_frame *frame)
{
	switch (frame->id) {
	case FOURCC_YV12:
	case FOURCC_I420:
	case FOURCC_XVMC:
		return video->colorspace ?
			WM_KERNEL_VIDEO_PLANAR_BT709 :
			WM_KERNEL_VIDEO_PLANAR_BT601;

	case FOURCC_NV12:
		return video->colorspace ?
			WM_KERNEL_VIDEO_NV12_BT709 :
			WM_KERNEL_VIDEO_NV12_BT601;

	default:
		return video->colorspace ?
			WM_KERNEL_VIDEO_PACKED_BT709 :
			WM_KERNEL_VIDEO_PACKED_BT601;
	}
}

static bool
gen5_render_video(struct sna *sna,
		  struct sna_video *video,
		  struct sna_video_frame *frame,
		  RegionPtr dstRegion,
		  PixmapPtr pixmap)
{
	struct sna_composite_op tmp;
	struct sna_pixmap *priv = sna_pixmap(pixmap);
	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
	int src_width = frame->src.x2 - frame->src.x1;
	int src_height = frame->src.y2 - frame->src.y1;
	float src_offset_x, src_offset_y;
	float src_scale_x, src_scale_y;
	const BoxRec *box;
	int nbox;

	DBG(("%s: %dx%d -> %dx%d\n", __FUNCTION__,
	     src_width, src_height, dst_width, dst_height));

	assert(priv->gpu_bo);
	memset(&tmp, 0, sizeof(tmp));

	tmp.op = PictOpSrc;
	tmp.dst.pixmap = pixmap;
	tmp.dst.width  = pixmap->drawable.width;
	tmp.dst.height = pixmap->drawable.height;
	tmp.dst.format = sna_format_for_depth(pixmap->drawable.depth);
	tmp.dst.bo = priv->gpu_bo;

	if (src_width == dst_width && src_height == dst_height)
		tmp.src.filter = SAMPLER_FILTER_NEAREST;
	else
		tmp.src.filter = SAMPLER_FILTER_BILINEAR;
	tmp.src.repeat = SAMPLER_EXTEND_PAD;
	tmp.src.bo = frame->bo;
	tmp.mask.bo = NULL;
	tmp.u.gen5.wm_kernel = select_video_kernel(video, frame);
	tmp.u.gen5.ve_id = 2;
	tmp.is_affine = true;
	tmp.floats_per_vertex = 3;
	tmp.floats_per_rect = 9;
	tmp.priv = frame;

	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
		kgem_submit(&sna->kgem);
		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
			return false;
	}

	gen5_align_vertex(sna, &tmp);
	gen5_video_bind_surfaces(sna, &tmp);

	src_scale_x = (float)src_width / dst_width / frame->width;
	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;

	src_scale_y = (float)src_height / dst_height / frame->height;
	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;

	box = region_rects(dstRegion);
	nbox = region_num_rects(dstRegion);
	while (nbox--) {
		gen5_get_rectangles(sna, &tmp, 1, gen5_video_bind_surfaces);

		OUT_VERTEX(box->x2, box->y2);
		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

		OUT_VERTEX(box->x1, box->y2);
		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

		OUT_VERTEX(box->x1, box->y1);
		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);

		box++;
	}
	gen4_vertex_flush(sna);

	if (!DAMAGE_IS_ALL(priv->gpu_damage))
		sna_damage_add(&priv->gpu_damage, dstRegion);

	return true;
}

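/* The texcoord maths above maps destination pixels straight to
 * normalised source texels.  As an illustration, take frame->width =
 * 640 with src spanning the whole frame (src.x1 = 0, src_width = 640)
 * scaled onto a 1280-pixel-wide destination whose extents start at 0:
 * src_scale_x = 640/1280/640 = 1/1280 and src_offset_x = 0, so dst
 * x = 1280 samples u = 1.0, the right-hand edge of the frame.  Only
 * three vertices are emitted per box; the RECTLIST topology infers the
 * fourth corner.
 */
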
1495static int
1496gen5_composite_picture(struct sna *sna,
1497		       PicturePtr picture,
1498		       struct sna_composite_channel *channel,
1499		       int x, int y,
1500		       int w, int h,
1501		       int dst_x, int dst_y,
1502		       bool precise)
1503{
1504	PixmapPtr pixmap;
1505	uint32_t color;
1506	int16_t dx, dy;
1507
1508	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
1509	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
1510
1511	channel->is_solid = false;
1512	channel->card_format = -1;
1513
1514	if (sna_picture_is_solid(picture, &color))
1515		return gen4_channel_init_solid(sna, channel, color);
1516
1517	if (picture->pDrawable == NULL) {
1518		int ret;
1519
1520		if (picture->pSourcePict->type == SourcePictTypeLinear)
1521			return gen4_channel_init_linear(sna, picture, channel,
1522							x, y,
1523							w, h,
1524							dst_x, dst_y);
1525
1526		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
1527		ret = -1;
1528		if (!precise)
1529			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
1530								      x, y, w, h, dst_x, dst_y);
1531		if (ret == -1)
1532			ret = sna_render_picture_fixup(sna, picture, channel,
1533						       x, y, w, h, dst_x, dst_y);
1534		return ret;
1535	}
1536
1537	if (picture->alphaMap) {
1538		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
1539		return sna_render_picture_fixup(sna, picture, channel,
1540						x, y, w, h, dst_x, dst_y);
1541	}
1542
1543	if (!gen5_check_repeat(picture))
1544		return sna_render_picture_fixup(sna, picture, channel,
1545						x, y, w, h, dst_x, dst_y);
1546
1547	if (!gen5_check_filter(picture))
1548		return sna_render_picture_fixup(sna, picture, channel,
1549						x, y, w, h, dst_x, dst_y);
1550
1551	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
1552	channel->filter = picture->filter;
1553
1554	pixmap = get_drawable_pixmap(picture->pDrawable);
1555	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);
1556
1557	x += dx + picture->pDrawable->x;
1558	y += dy + picture->pDrawable->y;
1559
1560	channel->is_affine = sna_transform_is_affine(picture->transform);
1561	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
1562		DBG(("%s: integer translation (%d, %d), removing\n",
1563		     __FUNCTION__, dx, dy));
1564		x += dx;
1565		y += dy;
1566		channel->transform = NULL;
1567		channel->filter = PictFilterNearest;
1568
1569		if (channel->repeat ||
1570		    (x >= 0 &&
1571		     y >= 0 &&
1572		     x + w <= pixmap->drawable.width &&
1573		     y + h <= pixmap->drawable.height)) {
1574			struct sna_pixmap *priv = sna_pixmap(pixmap);
1575			if (priv && priv->clear) {
1576				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
1577				return gen4_channel_init_solid(sna, channel, solid_color(picture->format, priv->clear_color));
1578			}
1579		}
1580	} else
1581		channel->transform = picture->transform;
1582
1583	channel->pict_format = picture->format;
1584	channel->card_format = gen5_get_card_format(picture->format);
1585	if (channel->card_format == -1)
1586		return sna_render_picture_convert(sna, picture, channel, pixmap,
1587						  x, y, w, h, dst_x, dst_y,
1588						  false);
1589
1590	if (too_large(pixmap->drawable.width, pixmap->drawable.height))
1591		return sna_render_picture_extract(sna, picture, channel,
1592						  x, y, w, h, dst_x, dst_y);
1593
1594	DBG(("%s: pixmap, repeat=%d, filter=%d, transform?=%d [affine? %d], format=%08x\n",
1595	     __FUNCTION__,
1596	     channel->repeat, channel->filter,
1597	     channel->transform != NULL, channel->is_affine,
1598	     channel->pict_format));
1599	if (channel->transform) {
1600		DBG(("%s: transform=[%f %f %f, %f %f %f, %f %f %f]\n",
1601		     __FUNCTION__,
1602		     channel->transform->matrix[0][0] / 65536.,
1603		     channel->transform->matrix[0][1] / 65536.,
1604		     channel->transform->matrix[0][2] / 65536.,
1605		     channel->transform->matrix[1][0] / 65536.,
1606		     channel->transform->matrix[1][1] / 65536.,
1607		     channel->transform->matrix[1][2] / 65536.,
1608		     channel->transform->matrix[2][0] / 65536.,
1609		     channel->transform->matrix[2][1] / 65536.,
1610		     channel->transform->matrix[2][2] / 65536.));
1611	}
1612
1613	return sna_render_pixmap_bo(sna, channel, pixmap,
1614				    x, y, w, h, dst_x, dst_y);
1615}
1616
1617static void gen5_composite_channel_convert(struct sna_composite_channel *channel)
1618{
1619	channel->repeat = gen5_repeat(channel->repeat);
1620	channel->filter = gen5_filter(channel->filter);
1621	if (channel->card_format == (unsigned)-1)
1622		channel->card_format = gen5_get_card_format(channel->pict_format);
1623}
1624
1625static void
1626gen5_render_composite_done(struct sna *sna,
1627			   const struct sna_composite_op *op)
1628{
1629	if (sna->render.vertex_offset) {
1630		gen4_vertex_flush(sna);
1631		gen5_magic_ca_pass(sna,op);
1632	}
1633
1634	DBG(("%s()\n", __FUNCTION__));
1635
1636	if (op->mask.bo)
1637		kgem_bo_destroy(&sna->kgem, op->mask.bo);
1638	if (op->src.bo)
1639		kgem_bo_destroy(&sna->kgem, op->src.bo);
1640
1641	sna_render_composite_redirect_done(sna, op);
1642}
1643
1644static bool
1645gen5_composite_set_target(struct sna *sna,
1646			  struct sna_composite_op *op,
1647			  PicturePtr dst,
1648			  int x, int y, int w, int h,
1649			  bool partial)
1650{
1651	BoxRec box;
1652	unsigned hint;
1653
1654	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
1655	op->dst.width  = op->dst.pixmap->drawable.width;
1656	op->dst.height = op->dst.pixmap->drawable.height;
1657	op->dst.format = dst->format;
1658	if (w && h) {
1659		box.x1 = x;
1660		box.y1 = y;
1661		box.x2 = x + w;
1662		box.y2 = y + h;
1663	} else
1664		sna_render_picture_extents(dst, &box);
1665
1666	hint = PREFER_GPU | RENDER_GPU;
1667	if (!need_tiling(sna, op->dst.width, op->dst.height))
1668		hint |= FORCE_GPU;
1669	if (!partial) {
1670		hint |= IGNORE_DAMAGE;
1671		if (w == op->dst.width && h == op->dst.height)
1672			hint |= REPLACES;
1673	}
1674
1675	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
1676	if (op->dst.bo == NULL)
1677		return false;
1678
1679	if (hint & REPLACES) {
1680		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
1681		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
1682	}
1683
1684	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
1685			    &op->dst.x, &op->dst.y);
1686
1687	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
1688	     __FUNCTION__,
1689	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
1690	     op->dst.width, op->dst.height,
1691	     op->dst.bo->pitch,
1692	     op->dst.x, op->dst.y,
1693	     op->damage ? *op->damage : (void *)-1));
1694
1695	assert(op->dst.bo->proxy == NULL);
1696
1697	if (too_large(op->dst.width, op->dst.height) &&
1698	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
1699		return false;
1700
1701	return true;
1702}
1703
1704static bool
1705is_gradient(PicturePtr picture, bool precise)
1706{
1707	if (picture->pDrawable)
1708		return false;
1709
1710	switch (picture->pSourcePict->type) {
1711	case SourcePictTypeSolidFill:
1712	case SourcePictTypeLinear:
1713		return false;
1714	default:
1715		return precise;
1716	}
1717}
1718
1719static bool
1720has_alphamap(PicturePtr p)
1721{
1722	return p->alphaMap != NULL;
1723}
1724
1725static bool
1726need_upload(struct sna *sna, PicturePtr p)
1727{
1728	return p->pDrawable && untransformed(p) &&
1729		!is_gpu(sna, p->pDrawable, PREFER_GPU_RENDER);
1730}
1731
1732static bool
1733source_is_busy(PixmapPtr pixmap)
1734{
1735	struct sna_pixmap *priv = sna_pixmap(pixmap);
1736	if (priv == NULL)
1737		return false;
1738
1739	if (priv->clear)
1740		return false;
1741
1742	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
1743		return true;
1744
1745	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
1746		return true;
1747
1748	return priv->gpu_damage && !priv->cpu_damage;
1749}
1750
1751static bool
1752source_fallback(struct sna *sna, PicturePtr p, PixmapPtr pixmap, bool precise)
1753{
1754	if (sna_picture_is_solid(p, NULL))
1755		return false;
1756
1757	if (is_gradient(p, precise) ||
1758	    !gen5_check_repeat(p) ||
1759	    !gen5_check_format(p->format))
1760		return true;
1761
1762	if (pixmap && source_is_busy(pixmap))
1763		return false;
1764
1765	return has_alphamap(p) || !gen5_check_filter(p) || need_upload(sna, p);
1766}
1767
1768static bool
1769gen5_composite_fallback(struct sna *sna,
1770			PicturePtr src,
1771			PicturePtr mask,
1772			PicturePtr dst)
1773{
1774	PixmapPtr src_pixmap;
1775	PixmapPtr mask_pixmap;
1776	PixmapPtr dst_pixmap;
1777	bool src_fallback, mask_fallback;
1778
1779	if (!gen5_check_dst_format(dst->format)) {
1780		DBG(("%s: unknown destination format: %d\n",
1781		     __FUNCTION__, dst->format));
1782		return true;
1783	}
1784
1785	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
1786
1787	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
1788	src_fallback = source_fallback(sna, src, src_pixmap,
1789				       dst->polyMode == PolyModePrecise);
1790
1791	if (mask) {
1792		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
1793		mask_fallback = source_fallback(sna, mask, mask_pixmap,
1794						dst->polyMode == PolyModePrecise);
1795	} else {
1796		mask_pixmap = NULL;
1797		mask_fallback = false;
1798	}
1799
1800	/* If we are using the destination as a source and need to
1801	 * read back in order to upload the source, do it all
1802	 * on the CPU.
1803	 */
1804	if (src_pixmap == dst_pixmap && src_fallback) {
1805		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
1806		return true;
1807	}
1808	if (mask_pixmap == dst_pixmap && mask_fallback) {
1809		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
1810		return true;
1811	}
1812
1813	/* If anything is on the GPU, push everything out to the GPU */
1814	if (dst_use_gpu(dst_pixmap)) {
1815		DBG(("%s: dst is already on the GPU, try to use GPU\n",
1816		     __FUNCTION__));
1817		return false;
1818	}
1819
1820	if (src_pixmap && !src_fallback) {
1821		DBG(("%s: src is already on the GPU, try to use GPU\n",
1822		     __FUNCTION__));
1823		return false;
1824	}
1825	if (mask_pixmap && !mask_fallback) {
1826		DBG(("%s: mask is already on the GPU, try to use GPU\n",
1827		     __FUNCTION__));
1828		return false;
1829	}
1830
1831	/* However, if the dst is not on the GPU and we need to
1832	 * render one of the sources using the CPU, we may
1833	 * as well do the entire operation in place on the CPU.
1834	 */
1835	if (src_fallback) {
1836		DBG(("%s: dst is on the CPU and src will fallback\n",
1837		     __FUNCTION__));
1838		return true;
1839	}
1840
1841	if (mask_fallback) {
1842		DBG(("%s: dst is on the CPU and mask will fallback\n",
1843		     __FUNCTION__));
1844		return true;
1845	}
1846
1847	if (too_large(dst_pixmap->drawable.width,
1848		      dst_pixmap->drawable.height) &&
1849	    dst_is_cpu(dst_pixmap)) {
1850		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
1851		return true;
1852	}
1853
1854	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
1855	     __FUNCTION__));
1856	return dst_use_cpu(dst_pixmap);
1857}
1858
1859static bool
1860reuse_source(struct sna *sna,
1861	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
1862	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
1863{
1864	uint32_t color;
1865
1866	if (src_x != msk_x || src_y != msk_y)
1867		return false;
1868
1869	if (src == mask) {
1870		DBG(("%s: mask is source\n", __FUNCTION__));
1871		*mc = *sc;
1872		mc->bo = kgem_bo_reference(mc->bo);
1873		return true;
1874	}
1875
1876	if (sna_picture_is_solid(mask, &color))
1877		return gen4_channel_init_solid(sna, mc, color);
1878
1879	if (sc->is_solid)
1880		return false;
1881
1882	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
1883		return false;
1884
1885	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));
1886
1887	if (!sna_transform_equal(src->transform, mask->transform))
1888		return false;
1889
1890	if (!sna_picture_alphamap_equal(src, mask))
1891		return false;
1892
1893	if (!gen5_check_repeat(mask))
1894		return false;
1895
1896	if (!gen5_check_filter(mask))
1897		return false;
1898
1899	if (!gen5_check_format(mask->format))
1900		return false;
1901
1902	DBG(("%s: reusing source channel for mask with a twist\n",
1903	     __FUNCTION__));
1904
1905	*mc = *sc;
1906	mc->repeat = gen5_repeat(mask->repeat ? mask->repeatType : RepeatNone);
1907	mc->filter = gen5_filter(mask->filter);
1908	mc->pict_format = mask->format;
1909	mc->card_format = gen5_get_card_format(mask->format);
1910	mc->bo = kgem_bo_reference(mc->bo);
1911	return true;
1912}
1913
1914static bool
1915gen5_render_composite(struct sna *sna,
1916		      uint8_t op,
1917		      PicturePtr src,
1918		      PicturePtr mask,
1919		      PicturePtr dst,
1920		      int16_t src_x, int16_t src_y,
1921		      int16_t msk_x, int16_t msk_y,
1922		      int16_t dst_x, int16_t dst_y,
1923		      int16_t width, int16_t height,
1924		      unsigned flags,
1925		      struct sna_composite_op *tmp)
1926{
1927	DBG(("%s: %dx%d, current mode=%d\n", __FUNCTION__,
1928	     width, height, sna->kgem.mode));
1929
1930	if (op >= ARRAY_SIZE(gen5_blend_op)) {
1931		DBG(("%s: unhandled blend op %d\n", __FUNCTION__, op));
1932		return false;
1933	}
1934
1935	if (mask == NULL &&
1936	    sna_blt_composite(sna, op,
1937			      src, dst,
1938			      src_x, src_y,
1939			      dst_x, dst_y,
1940			      width, height,
1941			      flags, tmp))
1942		return true;
1943
1944	if (gen5_composite_fallback(sna, src, mask, dst))
1945		goto fallback;
1946
1947	if (need_tiling(sna, width, height))
1948		return sna_tiling_composite(op, src, mask, dst,
1949					    src_x, src_y,
1950					    msk_x, msk_y,
1951					    dst_x, dst_y,
1952					    width, height,
1953					    tmp);
1954
1955	if (!gen5_composite_set_target(sna, tmp, dst,
1956				       dst_x, dst_y, width, height,
1957				       flags & COMPOSITE_PARTIAL || op > PictOpSrc)) {
1958		DBG(("%s: failed to set composite target\n", __FUNCTION__));
1959		goto fallback;
1960	}
1961
1962	DBG(("%s: preparing source\n", __FUNCTION__));
1963	tmp->op = op;
1964	switch (gen5_composite_picture(sna, src, &tmp->src,
1965				       src_x, src_y,
1966				       width, height,
1967				       dst_x, dst_y,
1968				       dst->polyMode == PolyModePrecise)) {
1969	case -1:
1970		DBG(("%s: failed to prepare source picture\n", __FUNCTION__));
1971		goto cleanup_dst;
1972	case 0:
1973		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
1974			goto cleanup_dst;
1975		/* fall through to fixup */
1976	case 1:
1977		if (mask == NULL &&
1978		    sna_blt_composite__convert(sna,
1979					       dst_x, dst_y, width, height,
1980					       tmp))
1981			return true;
1982
1983		gen5_composite_channel_convert(&tmp->src);
1984		break;
1985	}
1986
1987	tmp->is_affine = tmp->src.is_affine;
1988	tmp->has_component_alpha = false;
1989	tmp->need_magic_ca_pass = false;
1990
1991	if (mask) {
1992		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
1993			tmp->has_component_alpha = true;
1994
1995			/* Check if it's component alpha that relies on a source alpha and on
1996			 * the source value.  We can only get one of those into the single
1997			 * source value that we get to blend with.
1998			 */
1999			if (gen5_blend_op[op].src_alpha &&
2000			    (gen5_blend_op[op].src_blend != GEN5_BLENDFACTOR_ZERO)) {
2001				if (op != PictOpOver) {
2002					DBG(("%s: unhandled CA blend op %d\n", __FUNCTION__, op));
2003					goto cleanup_src;
2004				}
2005
2006				tmp->need_magic_ca_pass = true;
2007				tmp->op = PictOpOutReverse;
2008			}
2009		}
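
		/* A sketch of the math forcing the two-pass approach: with
		 * component alpha, PictOpOver wants
		 *
		 *	dst = src * mask + (1 - srcA * mask) * dst
		 *
		 * which needs both src*mask and srcA*mask as blend inputs,
		 * but the blender accepts only one source operand.  So the
		 * operation is split:
		 *
		 *	pass 1: dst = (1 - srcA * mask) * dst   (OutReverse)
		 *	pass 2: dst += src * mask               (Add)
		 *
		 * with pass 2 -- the "magic ca pass" -- replayed over the
		 * same rectangles once the primary pass has been flushed.
		 */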
2010
2011		if (!reuse_source(sna,
2012				  src, &tmp->src, src_x, src_y,
2013				  mask, &tmp->mask, msk_x, msk_y)) {
2014			DBG(("%s: preparing mask\n", __FUNCTION__));
2015			switch (gen5_composite_picture(sna, mask, &tmp->mask,
2016						       msk_x, msk_y,
2017						       width, height,
2018						       dst_x, dst_y,
2019						       dst->polyMode == PolyModePrecise)) {
2020			case -1:
2021				DBG(("%s: failed to prepare mask picture\n", __FUNCTION__));
2022				goto cleanup_src;
2023			case 0:
2024				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
2025					goto cleanup_src;
2026				/* fall through to fixup */
2027			case 1:
2028				gen5_composite_channel_convert(&tmp->mask);
2029				break;
2030			}
2031		}
2032
2033		tmp->is_affine &= tmp->mask.is_affine;
2034	}
2035
2036	tmp->u.gen5.wm_kernel =
2037		gen5_choose_composite_kernel(tmp->op,
2038					     tmp->mask.bo != NULL,
2039					     tmp->has_component_alpha,
2040					     tmp->is_affine);
2041	tmp->u.gen5.ve_id = gen4_choose_composite_emitter(sna, tmp);
2042
2043	tmp->blt   = gen5_render_composite_blt;
2044	tmp->box   = gen5_render_composite_box;
2045	tmp->boxes = gen5_render_composite_boxes__blt;
2046	if (tmp->emit_boxes) {
2047		tmp->boxes = gen5_render_composite_boxes;
2048		tmp->thread_boxes = gen5_render_composite_boxes__thread;
2049	}
2050	tmp->done  = gen5_render_composite_done;
2051
2052	if (!kgem_check_bo(&sna->kgem,
2053			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL)) {
2054		kgem_submit(&sna->kgem);
2055		if (!kgem_check_bo(&sna->kgem,
2056				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL))
2057			goto cleanup_mask;
2058	}
2059
2060	gen5_align_vertex(sna, tmp);
2061	gen5_bind_surfaces(sna, tmp);
2062	return true;
2063
2064cleanup_mask:
2065	if (tmp->mask.bo) {
2066		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
2067		tmp->mask.bo = NULL;
2068	}
2069cleanup_src:
2070	if (tmp->src.bo) {
2071		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
2072		tmp->src.bo = NULL;
2073	}
2074cleanup_dst:
2075	if (tmp->redirect.real_bo) {
2076		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
2077		tmp->redirect.real_bo = NULL;
2078	}
2079fallback:
2080	return (mask == NULL &&
2081		sna_blt_composite(sna, op,
2082				  src, dst,
2083				  src_x, src_y,
2084				  dst_x, dst_y,
2085				  width, height,
2086				  flags | COMPOSITE_FALLBACK, tmp));
2087}
2088
2089#if !NO_COMPOSITE_SPANS
2090fastcall static void
2091gen5_render_composite_spans_box(struct sna *sna,
2092				const struct sna_composite_spans_op *op,
2093				const BoxRec *box, float opacity)
2094{
2095	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
2096	     __FUNCTION__,
2097	     op->base.src.offset[0], op->base.src.offset[1],
2098	     opacity,
2099	     op->base.dst.x, op->base.dst.y,
2100	     box->x1, box->y1,
2101	     box->x2 - box->x1,
2102	     box->y2 - box->y1));
2103
2104	gen5_get_rectangles(sna, &op->base, 1, gen5_bind_surfaces);
2105	op->prim_emit(sna, op, box, opacity);
2106}
2107
2108static void
2109gen5_render_composite_spans_boxes(struct sna *sna,
2110				  const struct sna_composite_spans_op *op,
2111				  const BoxRec *box, int nbox,
2112				  float opacity)
2113{
2114	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
2115	     __FUNCTION__, nbox,
2116	     op->base.src.offset[0], op->base.src.offset[1],
2117	     opacity,
2118	     op->base.dst.x, op->base.dst.y));
2119
2120	do {
2121		int nbox_this_time;
2122
2123		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
2124						     gen5_bind_surfaces);
2125		nbox -= nbox_this_time;
2126
2127		do {
2128			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2129			     box->x1, box->y1,
2130			     box->x2 - box->x1,
2131			     box->y2 - box->y1));
2132
2133			op->prim_emit(sna, op, box++, opacity);
2134		} while (--nbox_this_time);
2135	} while (nbox);
2136}
2137
2138fastcall static void
2139gen5_render_composite_spans_boxes__thread(struct sna *sna,
2140					  const struct sna_composite_spans_op *op,
2141					  const struct sna_opacity_box *box,
2142					  int nbox)
2143{
2144	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
2145	     __FUNCTION__, nbox,
2146	     op->base.src.offset[0], op->base.src.offset[1],
2147	     op->base.dst.x, op->base.dst.y));
2148
2149	sna_vertex_lock(&sna->render);
2150	do {
2151		int nbox_this_time;
2152		float *v;
2153
2154		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
2155						     gen5_bind_surfaces);
2156		assert(nbox_this_time);
2157		nbox -= nbox_this_time;
2158
2159		v = sna->render.vertices + sna->render.vertex_used;
2160		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
2161
2162		sna_vertex_acquire__locked(&sna->render);
2163		sna_vertex_unlock(&sna->render);
2164
2165		op->emit_boxes(op, box, nbox_this_time, v);
2166		box += nbox_this_time;
2167
2168		sna_vertex_lock(&sna->render);
2169		sna_vertex_release__locked(&sna->render);
2170	} while (nbox);
2171	sna_vertex_unlock(&sna->render);
2172}
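
/* The locking protocol above, spelled out: vertex space is reserved under
 * the vertex lock (the bump of vertex_used), the vbo is pinned via
 * sna_vertex_acquire__locked(), and only then is the lock dropped so that
 * emit_boxes() can write the reserved floats concurrently with other
 * threads; the matching release under the lock lets the vbo be flushed
 * again once all writers are done.
 */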
2173
2174fastcall static void
2175gen5_render_composite_spans_done(struct sna *sna,
2176				 const struct sna_composite_spans_op *op)
2177{
2178	if (sna->render.vertex_offset)
2179		gen4_vertex_flush(sna);
2180
2181	DBG(("%s()\n", __FUNCTION__));
2182
2183	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2184	sna_render_composite_redirect_done(sna, &op->base);
2185}
2186
2187static bool
2188gen5_check_composite_spans(struct sna *sna,
2189			   uint8_t op, PicturePtr src, PicturePtr dst,
2190			   int16_t width, int16_t height,
2191			   unsigned flags)
2192{
2193	DBG(("%s: op=%d, width=%d, height=%d, flags=%x\n",
2194	     __FUNCTION__, op, width, height, flags));
2195
2196	if (op >= ARRAY_SIZE(gen5_blend_op))
2197		return false;
2198
2199	if (gen5_composite_fallback(sna, src, NULL, dst)) {
2200		DBG(("%s: operation would fallback\n", __FUNCTION__));
2201		return false;
2202	}
2203
2204	if (need_tiling(sna, width, height) &&
2205	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
2206		DBG(("%s: fallback, tiled operation not on GPU\n",
2207		     __FUNCTION__));
2208		return false;
2209	}
2210
2211	if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0) {
2212		struct sna_pixmap *priv = sna_pixmap_from_drawable(dst->pDrawable);
2213		assert(priv);
2214
2215		if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
2216			return true;
2217
2218		if (flags & COMPOSITE_SPANS_INPLACE_HINT)
2219			return false;
2220
2221		if ((sna->render.prefer_gpu & PREFER_GPU_SPANS) == 0 &&
2222		    dst->format == PICT_a8)
2223			return false;
2224
2225		return priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo);
2226	}
2227
2228	return true;
2229}
2230
2231static bool
2232gen5_render_composite_spans(struct sna *sna,
2233			    uint8_t op,
2234			    PicturePtr src,
2235			    PicturePtr dst,
2236			    int16_t src_x,  int16_t src_y,
2237			    int16_t dst_x,  int16_t dst_y,
2238			    int16_t width,  int16_t height,
2239			    unsigned flags,
2240			    struct sna_composite_spans_op *tmp)
2241{
2242	DBG(("%s: %dx%d with flags=%x, current ring=%d\n", __FUNCTION__,
2243	     width, height, flags, sna->kgem.ring));
2244
2245	assert(gen5_check_composite_spans(sna, op, src, dst, width, height, flags));
2246
2247	if (need_tiling(sna, width, height)) {
2248		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
2249		     __FUNCTION__, width, height));
2250		return sna_tiling_composite_spans(op, src, dst,
2251						  src_x, src_y, dst_x, dst_y,
2252						  width, height, flags, tmp);
2253	}
2254
2255	tmp->base.op = op;
2256	if (!gen5_composite_set_target(sna, &tmp->base, dst,
2257				       dst_x, dst_y, width, height,
2258				       true))
2259		return false;
2260
2261	switch (gen5_composite_picture(sna, src, &tmp->base.src,
2262				       src_x, src_y,
2263				       width, height,
2264				       dst_x, dst_y,
2265				       dst->polyMode == PolyModePrecise)) {
2266	case -1:
2267		goto cleanup_dst;
2268	case 0:
2269		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
2270			goto cleanup_dst;
2271		/* fall through to fixup */
2272	case 1:
2273		gen5_composite_channel_convert(&tmp->base.src);
2274		break;
2275	}
2276
2277	tmp->base.mask.bo = NULL;
2278
2279	tmp->base.is_affine = tmp->base.src.is_affine;
2280	tmp->base.has_component_alpha = false;
2281	tmp->base.need_magic_ca_pass = false;
2282
2283	tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(sna, tmp);
2284	tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
2285
2286	tmp->box   = gen5_render_composite_spans_box;
2287	tmp->boxes = gen5_render_composite_spans_boxes;
2288	if (tmp->emit_boxes)
2289		tmp->thread_boxes = gen5_render_composite_spans_boxes__thread;
2290	tmp->done  = gen5_render_composite_spans_done;
2291
2292	if (!kgem_check_bo(&sna->kgem,
2293			   tmp->base.dst.bo, tmp->base.src.bo,
2294			   NULL)) {
2295		kgem_submit(&sna->kgem);
2296		if (!kgem_check_bo(&sna->kgem,
2297				   tmp->base.dst.bo, tmp->base.src.bo,
2298				   NULL))
2299			goto cleanup_src;
2300	}
2301
2302	gen5_align_vertex(sna, &tmp->base);
2303	gen5_bind_surfaces(sna, &tmp->base);
2304	return true;
2305
2306cleanup_src:
2307	if (tmp->base.src.bo)
2308		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
2309cleanup_dst:
2310	if (tmp->base.redirect.real_bo)
2311		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
2312	return false;
2313}
2314#endif
2315
2316static void
2317gen5_copy_bind_surfaces(struct sna *sna,
2318			const struct sna_composite_op *op)
2319{
2320	bool dirty = kgem_bo_is_dirty(op->dst.bo);
2321	uint32_t *binding_table;
2322	uint16_t offset;
2323
2324	gen5_get_batch(sna, op);
2325
2326	binding_table = gen5_composite_get_binding_table(sna, &offset);
2327
2328	binding_table[0] =
2329		gen5_bind_bo(sna,
2330			     op->dst.bo, op->dst.width, op->dst.height,
2331			     gen5_get_dest_format(op->dst.format),
2332			     true);
2333	binding_table[1] =
2334		gen5_bind_bo(sna,
2335			     op->src.bo, op->src.width, op->src.height,
2336			     op->src.card_format,
2337			     false);
2338
2339	if (sna->kgem.surface == offset &&
2340	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen5.surface_table) == *(uint64_t*)binding_table) {
2341		sna->kgem.surface += sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);
2342		offset = sna->render_state.gen5.surface_table;
2343	}
2344
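
	/* This is a small surface-state cache: gen5_bind_bo() has just
	 * allocated two binding-table entries, and if they are bit-identical
	 * (compared as a single uint64_t) to the table used by the previous
	 * operation, the surface allocator is rewound and the previous table
	 * reused, so back-to-back copies between the same bo pair emit no
	 * new surface state at all.  Note also the `offset | dirty' idiom:
	 * surface offsets are aligned well beyond bit 0, so the dirty flag
	 * rides in the low bit of the value handed to gen5_emit_state().
	 */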
2345	gen5_emit_state(sna, op, offset | dirty);
2346}
2347
2348static bool
2349gen5_render_copy_boxes(struct sna *sna, uint8_t alu,
2350		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
2351		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
2352		       const BoxRec *box, int n, unsigned flags)
2353{
2354	struct sna_composite_op tmp;
2355
2356	DBG(("%s alu=%d, src=%ld:handle=%d, dst=%ld:handle=%d boxes=%d x [((%d, %d), (%d, %d))...], flags=%x\n",
2357	     __FUNCTION__, alu,
2358	     src->serialNumber, src_bo->handle,
2359	     dst->serialNumber, dst_bo->handle,
2360	     n, box->x1, box->y1, box->x2, box->y2,
2361	     flags));
2362
2363	if (sna_blt_compare_depth(src, dst) &&
2364	    sna_blt_copy_boxes(sna, alu,
2365			       src_bo, src_dx, src_dy,
2366			       dst_bo, dst_dx, dst_dy,
2367			       dst->bitsPerPixel,
2368			       box, n))
2369		return true;
2370
2371	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo) {
2372fallback_blt:
2373		if (!sna_blt_compare_depth(src, dst))
2374			return false;
2375
2376		return sna_blt_copy_boxes_fallback(sna, alu,
2377						   src, src_bo, src_dx, src_dy,
2378						   dst, dst_bo, dst_dx, dst_dy,
2379						   box, n);
2380	}
2381
2382	memset(&tmp, 0, sizeof(tmp));
2383
2384	if (dst->depth == src->depth) {
2385		tmp.dst.format = sna_render_format_for_depth(dst->depth);
2386		tmp.src.pict_format = tmp.dst.format;
2387	} else {
2388		tmp.dst.format = sna_format_for_depth(dst->depth);
2389		tmp.src.pict_format = sna_format_for_depth(src->depth);
2390	}
2391	if (!gen5_check_format(tmp.src.pict_format)) {
2392		DBG(("%s: unsupported source format, %x, use BLT\n",
2393		     __FUNCTION__, tmp.src.pict_format));
2394		goto fallback_blt;
2395	}
2396
2397	DBG(("%s (%d, %d)->(%d, %d) x %d\n",
2398	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n));
2399
2400	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
2401
2402	tmp.dst.pixmap = (PixmapPtr)dst;
2403	tmp.dst.width  = dst->width;
2404	tmp.dst.height = dst->height;
2405	tmp.dst.x = tmp.dst.y = 0;
2406	tmp.dst.bo = dst_bo;
2407	tmp.damage = NULL;
2408
2409	sna_render_composite_redirect_init(&tmp);
2410	if (too_large(tmp.dst.width, tmp.dst.height)) {
2411		BoxRec extents = box[0];
2412		int i;
2413
2414		for (i = 1; i < n; i++) {
2415			if (box[i].x1 < extents.x1)
2416				extents.x1 = box[i].x1;
2417			if (box[i].y1 < extents.y1)
2418				extents.y1 = box[i].y1;
2419
2420			if (box[i].x2 > extents.x2)
2421				extents.x2 = box[i].x2;
2422			if (box[i].y2 > extents.y2)
2423				extents.y2 = box[i].y2;
2424		}
2425		if (!sna_render_composite_redirect(sna, &tmp,
2426						   extents.x1 + dst_dx,
2427						   extents.y1 + dst_dy,
2428						   extents.x2 - extents.x1,
2429						   extents.y2 - extents.y1,
2430						   n > 1))
2431			goto fallback_tiled;
2432	}
2433
2434	tmp.src.filter = SAMPLER_FILTER_NEAREST;
2435	tmp.src.repeat = SAMPLER_EXTEND_NONE;
2436	tmp.src.card_format = gen5_get_card_format(tmp.src.pict_format);
2437	if (too_large(src->width, src->height)) {
2438		BoxRec extents = box[0];
2439		int i;
2440
2441		for (i = 1; i < n; i++) {
2442			if (box[i].x1 < extents.x1)
2443				extents.x1 = box[i].x1;
2444			if (box[i].y1 < extents.y1)
2445				extents.y1 = box[i].y1;
2446
2447			if (box[i].x2 > extents.x2)
2448				extents.x2 = box[i].x2;
2449			if (box[i].y2 > extents.y2)
2450				extents.y2 = box[i].y2;
2451		}
2452
2453		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
2454					       extents.x1 + src_dx,
2455					       extents.y1 + src_dy,
2456					       extents.x2 - extents.x1,
2457					       extents.y2 - extents.y1))
2458			goto fallback_tiled_dst;
2459	} else {
2460		tmp.src.bo = kgem_bo_reference(src_bo);
2461		tmp.src.width  = src->width;
2462		tmp.src.height = src->height;
2463		tmp.src.offset[0] = tmp.src.offset[1] = 0;
2464		tmp.src.scale[0] = 1.f/src->width;
2465		tmp.src.scale[1] = 1.f/src->height;
2466	}
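
	/* The scale factors turn pixel coordinates into the normalised
	 * [0,1] space the sampler expects; e.g. for a hypothetical
	 * 256-pixel-wide source, an edge at x=200 is emitted below as
	 * 200 * (1.f/256) = 0.78125.  The too_large() branch above instead
	 * samples from a proxy covering just the extents, with offset[]
	 * remapping box coordinates into that proxy.
	 */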
2467
2468	tmp.is_affine = true;
2469	tmp.floats_per_vertex = 3;
2470	tmp.floats_per_rect = 9;
2471	tmp.u.gen5.wm_kernel = WM_KERNEL;
2472	tmp.u.gen5.ve_id = 2;
2473
2474	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2475		kgem_submit(&sna->kgem);
2476		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2477			DBG(("%s: aperture check failed\n", __FUNCTION__));
2478			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2479			if (tmp.redirect.real_bo)
2480				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2481
2482			goto fallback_blt;
2483		}
2484	}
2485
2486	dst_dx += tmp.dst.x;
2487	dst_dy += tmp.dst.y;
2488	tmp.dst.x = tmp.dst.y = 0;
2489
2490	src_dx += tmp.src.offset[0];
2491	src_dy += tmp.src.offset[1];
2492
2493	gen5_align_vertex(sna, &tmp);
2494	gen5_copy_bind_surfaces(sna, &tmp);
2495
2496	do {
2497		int n_this_time;
2498
2499		n_this_time = gen5_get_rectangles(sna, &tmp, n,
2500						  gen5_copy_bind_surfaces);
2501		n -= n_this_time;
2502
2503		do {
2504			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
2505			     box->x1 + src_dx, box->y1 + src_dy,
2506			     box->x1 + dst_dx, box->y1 + dst_dy,
2507			     box->x2 - box->x1, box->y2 - box->y1));
2508			OUT_VERTEX(box->x2 + dst_dx, box->y2 + dst_dy);
2509			OUT_VERTEX_F((box->x2 + src_dx) * tmp.src.scale[0]);
2510			OUT_VERTEX_F((box->y2 + src_dy) * tmp.src.scale[1]);
2511
2512			OUT_VERTEX(box->x1 + dst_dx, box->y2 + dst_dy);
2513			OUT_VERTEX_F((box->x1 + src_dx) * tmp.src.scale[0]);
2514			OUT_VERTEX_F((box->y2 + src_dy) * tmp.src.scale[1]);
2515
2516			OUT_VERTEX(box->x1 + dst_dx, box->y1 + dst_dy);
2517			OUT_VERTEX_F((box->x1 + src_dx) * tmp.src.scale[0]);
2518			OUT_VERTEX_F((box->y1 + src_dy) * tmp.src.scale[1]);
2519
2520			box++;
2521		} while (--n_this_time);
2522	} while (n);
2523
2524	gen4_vertex_flush(sna);
2525	sna_render_composite_redirect_done(sna, &tmp);
2526	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2527	return true;
2528
2529fallback_tiled_dst:
2530	if (tmp.redirect.real_bo)
2531		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2532fallback_tiled:
2533	if (sna_blt_compare_depth(src, dst) &&
2534	    sna_blt_copy_boxes(sna, alu,
2535			       src_bo, src_dx, src_dy,
2536			       dst_bo, dst_dx, dst_dy,
2537			       dst->bitsPerPixel,
2538			       box, n))
2539		return true;
2540
2541	DBG(("%s: tiled fallback\n", __FUNCTION__));
2542	return sna_tiling_copy_boxes(sna, alu,
2543				     src, src_bo, src_dx, src_dy,
2544				     dst, dst_bo, dst_dx, dst_dy,
2545				     box, n);
2546}
2547
2548static void
2549gen5_render_copy_blt(struct sna *sna,
2550		     const struct sna_copy_op *op,
2551		     int16_t sx, int16_t sy,
2552		     int16_t w,  int16_t h,
2553		     int16_t dx, int16_t dy)
2554{
2555	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n", __FUNCTION__,
2556	     sx, sy, dx, dy, w, h));
2557
2558	gen5_get_rectangles(sna, &op->base, 1, gen5_copy_bind_surfaces);
2559
2560	OUT_VERTEX(dx+w, dy+h);
2561	OUT_VERTEX_F((sx+w)*op->base.src.scale[0]);
2562	OUT_VERTEX_F((sy+h)*op->base.src.scale[1]);
2563
2564	OUT_VERTEX(dx, dy+h);
2565	OUT_VERTEX_F(sx*op->base.src.scale[0]);
2566	OUT_VERTEX_F((sy+h)*op->base.src.scale[1]);
2567
2568	OUT_VERTEX(dx, dy);
2569	OUT_VERTEX_F(sx*op->base.src.scale[0]);
2570	OUT_VERTEX_F(sy*op->base.src.scale[1]);
2571}
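
/* As with every emitter in this file, only three vertices are written per
 * rectangle -- bottom-right, bottom-left, top-left, in that order.  The 3D
 * pipeline runs in RECTLIST mode, in which the hardware derives the fourth
 * corner itself, so a copy rectangle costs floats_per_rect == 9 floats:
 * per vertex, one dword of packed x/y (OUT_VERTEX) plus two texture
 * coordinates (OUT_VERTEX_F).
 */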
2572
2573static void
2574gen5_render_copy_done(struct sna *sna,
2575		      const struct sna_copy_op *op)
2576{
2577	if (sna->render.vertex_offset)
2578		gen4_vertex_flush(sna);
2579
2580	DBG(("%s()\n", __FUNCTION__));
2581}
2582
2583static bool
2584gen5_render_copy(struct sna *sna, uint8_t alu,
2585		 PixmapPtr src, struct kgem_bo *src_bo,
2586		 PixmapPtr dst, struct kgem_bo *dst_bo,
2587		 struct sna_copy_op *op)
2588{
2589	DBG(("%s (alu=%d)\n", __FUNCTION__, alu));
2590
2591	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2592	    sna_blt_copy(sna, alu,
2593			 src_bo, dst_bo,
2594			 dst->drawable.bitsPerPixel,
2595			 op))
2596		return true;
2597
2598	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
2599	    too_large(src->drawable.width, src->drawable.height) ||
2600	    too_large(dst->drawable.width, dst->drawable.height)) {
2601fallback:
2602		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
2603			return false;
2604
2605		return sna_blt_copy(sna, alu, src_bo, dst_bo,
2606				    dst->drawable.bitsPerPixel,
2607				    op);
2608	}
2609
2610	if (dst->drawable.depth == src->drawable.depth) {
2611		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
2612		op->base.src.pict_format = op->base.dst.format;
2613	} else {
2614		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
2615		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
2616	}
2617	if (!gen5_check_format(op->base.src.pict_format))
2618		goto fallback;
2619
2620	op->base.op = alu == GXcopy ? PictOpSrc : PictOpClear;
2621
2622	op->base.dst.pixmap = dst;
2623	op->base.dst.width  = dst->drawable.width;
2624	op->base.dst.height = dst->drawable.height;
2625	op->base.dst.bo = dst_bo;
2626
2627	op->base.src.bo = src_bo;
2628	op->base.src.card_format =
2629		gen5_get_card_format(op->base.src.pict_format);
2630	op->base.src.width  = src->drawable.width;
2631	op->base.src.height = src->drawable.height;
2632	op->base.src.scale[0] = 1.f/src->drawable.width;
2633	op->base.src.scale[1] = 1.f/src->drawable.height;
2634	op->base.src.filter = SAMPLER_FILTER_NEAREST;
2635	op->base.src.repeat = SAMPLER_EXTEND_NONE;
2636
2637	op->base.is_affine = true;
2638	op->base.floats_per_vertex = 3;
2639	op->base.floats_per_rect = 9;
2640	op->base.u.gen5.wm_kernel = WM_KERNEL;
2641	op->base.u.gen5.ve_id = 2;
2642
2643	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2644		kgem_submit(&sna->kgem);
2645		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
2646			goto fallback;
2647	}
2648
2649	if (kgem_bo_is_dirty(src_bo)) {
2650		if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2651		    sna_blt_copy(sna, alu,
2652				 src_bo, dst_bo,
2653				 dst->drawable.bitsPerPixel,
2654				 op))
2655			return true;
2656	}
2657
2658	gen5_align_vertex(sna, &op->base);
2659	gen5_copy_bind_surfaces(sna, &op->base);
2660
2661	op->blt  = gen5_render_copy_blt;
2662	op->done = gen5_render_copy_done;
2663	return true;
2664}
2665
2666static void
2667gen5_fill_bind_surfaces(struct sna *sna,
2668			const struct sna_composite_op *op)
2669{
2670	bool dirty = kgem_bo_is_dirty(op->dst.bo);
2671	uint32_t *binding_table;
2672	uint16_t offset;
2673
2674	gen5_get_batch(sna, op);
2675
2676	binding_table = gen5_composite_get_binding_table(sna, &offset);
2677
2678	binding_table[0] =
2679		gen5_bind_bo(sna,
2680			     op->dst.bo, op->dst.width, op->dst.height,
2681			     gen5_get_dest_format(op->dst.format),
2682			     true);
2683	binding_table[1] =
2684		gen5_bind_bo(sna,
2685			     op->src.bo, 1, 1,
2686			     GEN5_SURFACEFORMAT_B8G8R8A8_UNORM,
2687			     false);
2688
2689	if (sna->kgem.surface == offset &&
2690	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen5.surface_table) == *(uint64_t*)binding_table) {
2691		sna->kgem.surface +=
2692			sizeof(struct gen5_surface_state_padded)/sizeof(uint32_t);
2693		offset = sna->render_state.gen5.surface_table;
2694	}
2695
2696	gen5_emit_state(sna, op, offset | dirty);
2697}
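
/* Fills sample their color from a cached 1x1 B8G8R8A8 surface (see
 * sna_render_get_solid() below) bound with repeat, which is why the fill
 * emitters that follow can use the single constant texture coordinate .5
 * -- the center of the only texel -- giving floats_per_vertex == 2: the
 * packed x/y dword plus that one float.
 */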
2698
2699static inline bool prefer_blt_fill(struct sna *sna)
2700{
2701#if PREFER_BLT_FILL
2702	return true;
2703#else
2704	return sna->kgem.mode != KGEM_RENDER;
2705#endif
2706}
2707
2708static bool
2709gen5_render_fill_boxes(struct sna *sna,
2710		       CARD8 op,
2711		       PictFormat format,
2712		       const xRenderColor *color,
2713		       const DrawableRec *dst, struct kgem_bo *dst_bo,
2714		       const BoxRec *box, int n)
2715{
2716	struct sna_composite_op tmp;
2717	uint32_t pixel;
2718
2719	DBG(("%s op=%x, color=(%04x,%04x,%04x,%04x), boxes=%d x [((%d, %d), (%d, %d))...]\n",
2720	     __FUNCTION__, op,
2721	     color->red, color->green, color->blue, color->alpha,
2722	     n, box->x1, box->y1, box->x2, box->y2));
2723
2724	if (op >= ARRAY_SIZE(gen5_blend_op)) {
2725		DBG(("%s: fallback due to unhandled blend op: %d\n",
2726		     __FUNCTION__, op));
2727		return false;
2728	}
2729
2730	if (op <= PictOpSrc &&
2731	    (prefer_blt_fill(sna) ||
2732	     too_large(dst->width, dst->height) ||
2733	     !gen5_check_dst_format(format))) {
2734		uint8_t alu = GXinvalid;
2735
2736		pixel = 0;
2737		if (op == PictOpClear)
2738			alu = GXclear;
2739		else if (sna_get_pixel_from_rgba(&pixel,
2740						 color->red,
2741						 color->green,
2742						 color->blue,
2743						 color->alpha,
2744						 format))
2745			alu = GXcopy;
2746
2747		if (alu != GXinvalid &&
2748		    sna_blt_fill_boxes(sna, alu,
2749				       dst_bo, dst->bitsPerPixel,
2750				       pixel, box, n))
2751			return true;
2752
2753		if (!gen5_check_dst_format(format))
2754			return false;
2755
2756		if (too_large(dst->width, dst->height))
2757			return sna_tiling_fill_boxes(sna, op, format, color,
2758						     dst, dst_bo, box, n);
2759	}
2760
2761	if (op == PictOpClear) {
2762		pixel = 0;
2763		op = PictOpSrc;
2764	} else if (!sna_get_pixel_from_rgba(&pixel,
2765					    color->red,
2766					    color->green,
2767					    color->blue,
2768					    color->alpha,
2769					    PICT_a8r8g8b8))
2770		return false;
2771
2772	DBG(("%s(%08x x %d)\n", __FUNCTION__, pixel, n));
2773
2774	memset(&tmp, 0, sizeof(tmp));
2775
2776	tmp.op = op;
2777
2778	tmp.dst.pixmap = (PixmapPtr)dst;
2779	tmp.dst.width  = dst->width;
2780	tmp.dst.height = dst->height;
2781	tmp.dst.format = format;
2782	tmp.dst.bo = dst_bo;
2783
2784	if (too_large(dst->width, dst->height)) {
2785		BoxRec extents;
2786
2787		boxes_extents(box, n, &extents);
2788		if (!sna_render_composite_redirect(sna, &tmp,
2789						   extents.x1, extents.y1,
2790						   extents.x2 - extents.x1,
2791						   extents.y2 - extents.y1,
2792						   n > 1))
2793			return sna_tiling_fill_boxes(sna, op, format, color,
2794						     dst, dst_bo, box, n);
2795	}
2796
2797	tmp.src.bo = sna_render_get_solid(sna, pixel);
2798	tmp.src.filter = SAMPLER_FILTER_NEAREST;
2799	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
2800
2801	tmp.is_affine = true;
2802	tmp.floats_per_vertex = 2;
2803	tmp.floats_per_rect = 6;
2804	tmp.u.gen5.wm_kernel = WM_KERNEL;
2805	tmp.u.gen5.ve_id = 1;
2806
2807	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2808		kgem_submit(&sna->kgem);
2809		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2810			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2811			return false;
2812		}
2813	}
2814
2815	gen5_align_vertex(sna, &tmp);
2816	gen5_fill_bind_surfaces(sna, &tmp);
2817
2818	do {
2819		int n_this_time;
2820
2821		n_this_time = gen5_get_rectangles(sna, &tmp, n,
2822						  gen5_fill_bind_surfaces);
2823		n -= n_this_time;
2824
2825		do {
2826			DBG(("	(%d, %d), (%d, %d)\n",
2827			     box->x1, box->y1, box->x2, box->y2));
2828			OUT_VERTEX(box->x2, box->y2);
2829			OUT_VERTEX_F(.5);
2830
2831			OUT_VERTEX(box->x1, box->y2);
2832			OUT_VERTEX_F(.5);
2833
2834			OUT_VERTEX(box->x1, box->y1);
2835			OUT_VERTEX_F(.5);
2836
2837			box++;
2838		} while (--n_this_time);
2839	} while (n);
2840
2841	gen4_vertex_flush(sna);
2842	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2843	sna_render_composite_redirect_done(sna, &tmp);
2844	return true;
2845}
2846
2847static void
2848gen5_render_fill_op_blt(struct sna *sna,
2849			const struct sna_fill_op *op,
2850			int16_t x, int16_t y, int16_t w, int16_t h)
2851{
2852	DBG(("%s (%d, %d)x(%d, %d)\n", __FUNCTION__, x,y,w,h));
2853
2854	gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
2855
2856	OUT_VERTEX(x+w, y+h);
2857	OUT_VERTEX_F(.5);
2858
2859	OUT_VERTEX(x, y+h);
2860	OUT_VERTEX_F(.5);
2861
2862	OUT_VERTEX(x, y);
2863	OUT_VERTEX_F(.5);
2864}
2865
2866fastcall static void
2867gen5_render_fill_op_box(struct sna *sna,
2868			const struct sna_fill_op *op,
2869			const BoxRec *box)
2870{
2871	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
2872	     box->x1, box->y1, box->x2, box->y2));
2873
2874	gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
2875
2876	OUT_VERTEX(box->x2, box->y2);
2877	OUT_VERTEX_F(.5);
2878
2879	OUT_VERTEX(box->x1, box->y2);
2880	OUT_VERTEX_F(.5);
2881
2882	OUT_VERTEX(box->x1, box->y1);
2883	OUT_VERTEX_F(.5);
2884}
2885
2886fastcall static void
2887gen5_render_fill_op_boxes(struct sna *sna,
2888			  const struct sna_fill_op *op,
2889			  const BoxRec *box,
2890			  int nbox)
2891{
2892	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
2893	     box->x1, box->y1, box->x2, box->y2, nbox));
2894
2895	do {
2896		int nbox_this_time;
2897
2898		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
2899						     gen5_fill_bind_surfaces);
2900		nbox -= nbox_this_time;
2901
2902		do {
2903			OUT_VERTEX(box->x2, box->y2);
2904			OUT_VERTEX_F(.5);
2905
2906			OUT_VERTEX(box->x1, box->y2);
2907			OUT_VERTEX_F(.5);
2908
2909			OUT_VERTEX(box->x1, box->y1);
2910			OUT_VERTEX_F(.5);
2911			box++;
2912		} while (--nbox_this_time);
2913	} while (nbox);
2914}
2915
2916static void
2917gen5_render_fill_op_done(struct sna *sna,
2918			 const struct sna_fill_op *op)
2919{
2920	if (sna->render.vertex_offset)
2921		gen4_vertex_flush(sna);
2922	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2923
2924	DBG(("%s()\n", __FUNCTION__));
2925}
2926
2927static bool
2928gen5_render_fill(struct sna *sna, uint8_t alu,
2929		 PixmapPtr dst, struct kgem_bo *dst_bo,
2930		 uint32_t color, unsigned flags,
2931		 struct sna_fill_op *op)
2932{
2933	DBG(("%s(alu=%d, color=%08x)\n", __FUNCTION__, alu, color));
2934
2935	if (prefer_blt_fill(sna) &&
2936	    sna_blt_fill(sna, alu,
2937			 dst_bo, dst->drawable.bitsPerPixel,
2938			 color,
2939			 op))
2940		return true;
2941
2942	if (!(alu == GXcopy || alu == GXclear) ||
2943	    too_large(dst->drawable.width, dst->drawable.height))
2944		return sna_blt_fill(sna, alu,
2945				    dst_bo, dst->drawable.bitsPerPixel,
2946				    color,
2947				    op);
2948
2949	if (alu == GXclear)
2950		color = 0;
2951
2952	op->base.op = color == 0 ? PictOpClear : PictOpSrc;
2953
2954	op->base.dst.pixmap = dst;
2955	op->base.dst.width  = dst->drawable.width;
2956	op->base.dst.height = dst->drawable.height;
2957	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
2958	op->base.dst.bo = dst_bo;
2959	op->base.dst.x = op->base.dst.y = 0;
2960
2961	op->base.need_magic_ca_pass = 0;
2962	op->base.has_component_alpha = 0;
2963
2964	op->base.src.bo =
2965		sna_render_get_solid(sna,
2966				     sna_rgba_for_color(color,
2967							dst->drawable.depth));
2968	op->base.src.filter = SAMPLER_FILTER_NEAREST;
2969	op->base.src.repeat = SAMPLER_EXTEND_REPEAT;
2970
2971	op->base.mask.bo = NULL;
2972	op->base.mask.filter = SAMPLER_FILTER_NEAREST;
2973	op->base.mask.repeat = SAMPLER_EXTEND_NONE;
2974
2975	op->base.is_affine = true;
2976	op->base.floats_per_vertex = 2;
2977	op->base.floats_per_rect = 6;
2978	op->base.u.gen5.wm_kernel = WM_KERNEL;
2979	op->base.u.gen5.ve_id = 1;
2980
2981	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2982		kgem_submit(&sna->kgem);
2983		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2984			kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2985			return false;
2986		}
2987	}
2988
2989	gen5_align_vertex(sna, &op->base);
2990	gen5_fill_bind_surfaces(sna, &op->base);
2991
2992	op->blt   = gen5_render_fill_op_blt;
2993	op->box   = gen5_render_fill_op_box;
2994	op->boxes = gen5_render_fill_op_boxes;
2995	op->points = NULL;
2996	op->done  = gen5_render_fill_op_done;
2997	return true;
2998}
2999
3000static bool
3001gen5_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3002			     uint32_t color,
3003			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
3004			     uint8_t alu)
3005{
3006	BoxRec box;
3007
3008	box.x1 = x1;
3009	box.y1 = y1;
3010	box.x2 = x2;
3011	box.y2 = y2;
3012
3013	return sna_blt_fill_boxes(sna, alu,
3014				  bo, dst->drawable.bitsPerPixel,
3015				  color, &box, 1);
3016}
3017
3018static bool
3019gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
3020		     uint32_t color,
3021		     int16_t x1, int16_t y1,
3022		     int16_t x2, int16_t y2,
3023		     uint8_t alu)
3024{
3025	struct sna_composite_op tmp;
3026
3027#if NO_FILL_ONE
3028	return gen5_render_fill_one_try_blt(sna, dst, bo, color,
3029					    x1, y1, x2, y2, alu);
3030#endif
3031
3032	/* Prefer to use the BLT if already engaged */
3033	if (prefer_blt_fill(sna) &&
3034	    gen5_render_fill_one_try_blt(sna, dst, bo, color,
3035					 x1, y1, x2, y2, alu))
3036		return true;
3037
3038	/* Must use the BLT if we can't RENDER... */
3039	if (!(alu == GXcopy || alu == GXclear) ||
3040	    too_large(dst->drawable.width, dst->drawable.height))
3041		return gen5_render_fill_one_try_blt(sna, dst, bo, color,
3042						    x1, y1, x2, y2, alu);
3043
3044	if (alu == GXclear)
3045		color = 0;
3046
3047	tmp.op = color == 0 ? PictOpClear : PictOpSrc;
3048
3049	tmp.dst.pixmap = dst;
3050	tmp.dst.width  = dst->drawable.width;
3051	tmp.dst.height = dst->drawable.height;
3052	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
3053	tmp.dst.bo = bo;
3054	tmp.dst.x = tmp.dst.y = 0;
3055
3056	tmp.src.bo =
3057		sna_render_get_solid(sna,
3058				     sna_rgba_for_color(color,
3059							dst->drawable.depth));
3060	tmp.src.filter = SAMPLER_FILTER_NEAREST;
3061	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
3062
3063	tmp.mask.bo = NULL;
3064	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
3065	tmp.mask.repeat = SAMPLER_EXTEND_NONE;
3066
3067	tmp.is_affine = true;
3068	tmp.floats_per_vertex = 2;
3069	tmp.floats_per_rect = 6;
3070	tmp.has_component_alpha = 0;
3071	tmp.need_magic_ca_pass = false;
3072
3073	tmp.u.gen5.wm_kernel = WM_KERNEL;
3074	tmp.u.gen5.ve_id = 1;
3075
3076	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3077		kgem_submit(&sna->kgem);
3078		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
3079			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3080			return false;
3081		}
3082	}
3083
3084	gen5_align_vertex(sna, &tmp);
3085	gen5_fill_bind_surfaces(sna, &tmp);
3086
3087	gen5_get_rectangles(sna, &tmp, 1, gen5_fill_bind_surfaces);
3088
3089	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
3090	OUT_VERTEX(x2, y2);
3091	OUT_VERTEX_F(.5);
3092
3093	OUT_VERTEX(x1, y2);
3094	OUT_VERTEX_F(.5);
3095
3096	OUT_VERTEX(x1, y1);
3097	OUT_VERTEX_F(.5);
3098
3099	gen4_vertex_flush(sna);
3100	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
3101
3102	return true;
3103}

3104static void
3105gen5_render_context_switch(struct kgem *kgem,
3106			   int new_mode)
3107{
3108	if (!kgem->nbatch)
3109		return;
3110
3111	/* WaNonPipelinedStateCommandFlush
3112	 *
3113	 * Ironlake has a limitation that a 3D or Media command can't
3114	 * be the first command after a BLT, unless it's
3115	 * non-pipelined.
3116	 *
3117	 * We do this by ensuring that the non-pipelined drawrect
3118	 * is always emitted first following a switch from BLT.
3119	 */
3120	if (kgem->mode == KGEM_BLT) {
3121		struct sna *sna = to_sna_from_kgem(kgem);
3122		DBG(("%s: forcing drawrect on next state emission\n",
3123		     __FUNCTION__));
3124		sna->render_state.gen5.drawrect_limit = -1;
3125	}
3126
3127	if (kgem_ring_is_idle(kgem, kgem->ring)) {
3128		DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
3129		_kgem_submit(kgem);
3130	}
3131}
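
/* The poke at drawrect_limit above is indirect: the next state emission
 * sees a stale drawing rectangle and re-emits 3DSTATE_DRAWING_RECTANGLE,
 * which is non-pipelined, so a batch that switches rings ends up shaped
 * roughly like
 *
 *	XY_SRC_COPY_BLT ...            <- last BLT command
 *	3DSTATE_DRAWING_RECTANGLE      <- non-pipelined, satisfies the W/A
 *	3DPRIMITIVE ...                <- first 3D command
 *
 * instead of starting the 3D stream with a pipelined command.
 */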
3132
3133static void gen5_render_reset(struct sna *sna)
3134{
3135	sna->render_state.gen5.needs_invariant = true;
3136	sna->render_state.gen5.ve_id = -1;
3137	sna->render_state.gen5.last_primitive = -1;
3138	sna->render_state.gen5.last_pipelined_pointers = 0;
3139
3140	sna->render_state.gen5.drawrect_offset = -1;
3141	sna->render_state.gen5.drawrect_limit = -1;
3142	sna->render_state.gen5.surface_table = -1;
3143
3144	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
3145		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
3146		discard_vbo(sna);
3147	}
3148
3149	sna->render.vertex_offset = 0;
3150	sna->render.nvertex_reloc = 0;
3151	sna->render.vb_id = 0;
3152}
3153
3154static void gen5_render_fini(struct sna *sna)
3155{
3156	kgem_bo_destroy(&sna->kgem, sna->render_state.gen5.general_bo);
3157}
3158
3159static uint32_t gen5_create_vs_unit_state(struct sna_static_stream *stream)
3160{
3161	struct gen5_vs_unit_state *vs = sna_static_stream_map(stream, sizeof(*vs), 32);
3162
3163	/* Set up the vertex shader to be disabled (passthrough) */
3164	vs->thread4.nr_urb_entries = URB_VS_ENTRIES >> 2;
3165	vs->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
3166	vs->vs6.vs_enable = 0;
3167	vs->vs6.vert_cache_disable = 1;
3168
3169	return sna_static_stream_offsetof(stream, vs);
3170}
3171
3172static uint32_t gen5_create_sf_state(struct sna_static_stream *stream,
3173				     uint32_t kernel)
3174{
3175	struct gen5_sf_unit_state *sf_state;
3176
3177	sf_state = sna_static_stream_map(stream, sizeof(*sf_state), 32);
3178
3179	sf_state->thread0.grf_reg_count = GEN5_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
3180	sf_state->thread0.kernel_start_pointer = kernel >> 6;
3181
3182	sf_state->thread3.const_urb_entry_read_length = 0;	/* no const URBs */
3183	sf_state->thread3.const_urb_entry_read_offset = 0;	/* no const URBs */
3184	sf_state->thread3.urb_entry_read_length = 1;	/* 1 URB per vertex */
3185	/* don't smash vertex header, read start from dw8 */
3186	sf_state->thread3.urb_entry_read_offset = 1;
3187	sf_state->thread3.dispatch_grf_start_reg = 3;
3188	sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
3189	sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
3190	sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
3191	sf_state->sf5.viewport_transform = false;	/* skip viewport */
3192	sf_state->sf6.cull_mode = GEN5_CULLMODE_NONE;
3193	sf_state->sf6.scissor = 0;
3194	sf_state->sf7.trifan_pv = 2;
3195	sf_state->sf6.dest_org_vbias = 0x8;
3196	sf_state->sf6.dest_org_hbias = 0x8;
3197
3198	return sna_static_stream_offsetof(stream, sf_state);
3199}
3200
3201static uint32_t gen5_create_sampler_state(struct sna_static_stream *stream,
3202					  sampler_filter_t src_filter,
3203					  sampler_extend_t src_extend,
3204					  sampler_filter_t mask_filter,
3205					  sampler_extend_t mask_extend)
3206{
3207	struct gen5_sampler_state *sampler_state;
3208
3209	sampler_state = sna_static_stream_map(stream,
3210					      sizeof(struct gen5_sampler_state) * 2,
3211					      32);
3212	sampler_state_init(&sampler_state[0], src_filter, src_extend);
3213	sampler_state_init(&sampler_state[1], mask_filter, mask_extend);
3214
3215	return sna_static_stream_offsetof(stream, sampler_state);
3216}
3217
3218static void gen5_init_wm_state(struct gen5_wm_unit_state *state,
3219			       bool has_mask,
3220			       uint32_t kernel,
3221			       uint32_t sampler)
3222{
3223	state->thread0.grf_reg_count = GEN5_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
3224	state->thread0.kernel_start_pointer = kernel >> 6;
3225
3226	state->thread1.single_program_flow = 0;
3227
3228	/* scratch space is not used in our kernel */
3229	state->thread2.scratch_space_base_pointer = 0;
3230	state->thread2.per_thread_scratch_space = 0;
3231
3232	state->thread3.const_urb_entry_read_length = 0;
3233	state->thread3.const_urb_entry_read_offset = 0;
3234
3235	state->thread3.urb_entry_read_offset = 0;
3236	/* the wm kernel reads its URB payload starting at grf 3; see wm_program in the compiler module */
3237	state->thread3.dispatch_grf_start_reg = 3;	/* must match kernel */
3238
3239	state->wm4.sampler_count = 0;	/* hardware requirement */
3240
3241	state->wm4.sampler_state_pointer = sampler >> 5;
3242	state->wm5.max_threads = PS_MAX_THREADS - 1;
3243	state->wm5.transposed_urb_read = 0;
3244	state->wm5.thread_dispatch_enable = 1;
3245	/* just use 16-pixel dispatch (4 subspans), don't need to change kernel
3246	 * start point
3247	 */
3248	state->wm5.enable_16_pix = 1;
3249	state->wm5.enable_8_pix = 0;
3250	state->wm5.early_depth_test = 1;
3251
3252	/* Each pair of attributes (src/mask coords) is two URB entries */
3253	if (has_mask) {
3254		state->thread1.binding_table_entry_count = 3;	/* 2 tex and fb */
3255		state->thread3.urb_entry_read_length = 4;
3256	} else {
3257		state->thread1.binding_table_entry_count = 2;	/* 1 tex and fb */
3258		state->thread3.urb_entry_read_length = 2;
3259	}
3260
3261	/* binding table entry count is only used for prefetching,
3262	 * and it has to be set to 0 on Ironlake
3263	 */
3264	state->thread1.binding_table_entry_count = 0;
3265}
3266
3267static uint32_t gen5_create_cc_unit_state(struct sna_static_stream *stream)
3268{
3269	uint8_t *ptr, *base;
3270	int i, j;
3271
3272	base = ptr =
3273		sna_static_stream_map(stream,
3274				      GEN5_BLENDFACTOR_COUNT*GEN5_BLENDFACTOR_COUNT*64,
3275				      64);
3276
3277	for (i = 0; i < GEN5_BLENDFACTOR_COUNT; i++) {
3278		for (j = 0; j < GEN5_BLENDFACTOR_COUNT; j++) {
3279			struct gen5_cc_unit_state *state =
3280				(struct gen5_cc_unit_state *)ptr;
3281
3282			state->cc3.blend_enable =
3283				!(j == GEN5_BLENDFACTOR_ZERO && i == GEN5_BLENDFACTOR_ONE);
3284
3285			state->cc5.logicop_func = 0xc;	/* COPY */
3286			state->cc5.ia_blend_function = GEN5_BLENDFUNCTION_ADD;
3287
3288			/* Fill in the alpha blend factors to match the color factors, for future use. */
3289			state->cc5.ia_src_blend_factor = i;
3290			state->cc5.ia_dest_blend_factor = j;
3291
3292			state->cc6.blend_function = GEN5_BLENDFUNCTION_ADD;
3293			state->cc6.clamp_post_alpha_blend = 1;
3294			state->cc6.clamp_pre_alpha_blend = 1;
3295			state->cc6.src_blend_factor = i;
3296			state->cc6.dest_blend_factor = j;
3297
3298			ptr += 64;
3299		}
3300	}
3301
3302	return sna_static_stream_offsetof(stream, base);
3303}
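
/* A sketch of the lookup this layout implies: with the loop order above
 * (source factor outer, destination factor inner, 64 bytes per entry),
 * the cc state for a given blend pair lives at
 *
 *	state->cc + (src_factor * GEN5_BLENDFACTOR_COUNT + dst_factor) * 64
 *
 * so selecting a blend mode at draw time is pure pointer arithmetic into
 * the pre-baked static stream, with no per-operation state emission.
 */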
3304
3305static bool gen5_render_setup(struct sna *sna)
3306{
3307	struct gen5_render_state *state = &sna->render_state.gen5;
3308	struct sna_static_stream general;
3309	struct gen5_wm_unit_state_padded *wm_state;
3310	uint32_t sf[2], wm[KERNEL_COUNT];
3311	int i, j, k, l, m;
3312
3313	sna_static_stream_init(&general);
3314
3315	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
3316	 * dumps, you know it points to zero.
3317	 */
3318	null_create(&general);
3319
3320	/* Set up the two SF states (one for blending with a mask, one without) */
3321	sf[0] = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__nomask);
3322	sf[1] = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__mask);
3323
3324	for (m = 0; m < KERNEL_COUNT; m++) {
3325		if (wm_kernels[m].size) {
3326			wm[m] = sna_static_stream_add(&general,
3327						      wm_kernels[m].data,
3328						      wm_kernels[m].size,
3329						      64);
3330		} else {
3331			wm[m] = sna_static_stream_compile_wm(sna, &general,
3332							     wm_kernels[m].data,
3333							     16);
3334		}
3335		assert(wm[m]);
3336	}
3337
3338	state->vs = gen5_create_vs_unit_state(&general);
3339
3340	state->sf[0] = gen5_create_sf_state(&general, sf[0]);
3341	state->sf[1] = gen5_create_sf_state(&general, sf[1]);
3342
3343
3344	/* Set up the WM states: each filter/extend type for source and mask, per
3345	 * kernel.
3346	 */
3347	wm_state = sna_static_stream_map(&general,
3348					  sizeof(*wm_state) * KERNEL_COUNT *
3349					  FILTER_COUNT * EXTEND_COUNT *
3350					  FILTER_COUNT * EXTEND_COUNT,
3351					  64);
3352	state->wm = sna_static_stream_offsetof(&general, wm_state);
3353	for (i = 0; i < FILTER_COUNT; i++) {
3354		for (j = 0; j < EXTEND_COUNT; j++) {
3355			for (k = 0; k < FILTER_COUNT; k++) {
3356				for (l = 0; l < EXTEND_COUNT; l++) {
3357					uint32_t sampler_state;
3358
3359					sampler_state =
3360						gen5_create_sampler_state(&general,
3361									  i, j,
3362									  k, l);
3363
3364					for (m = 0; m < KERNEL_COUNT; m++) {
3365						gen5_init_wm_state(&wm_state->state,
3366								   wm_kernels[m].has_mask,
3367								   wm[m], sampler_state);
3368						wm_state++;
3369					}
3370				}
3371			}
3372		}
3373	}
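
	/* A sketch of the indexing this nesting implies, matching the loop
	 * order (src filter, src extend, mask filter, mask extend, kernel):
	 *
	 *	state->wm + sizeof(struct gen5_wm_unit_state_padded) *
	 *		((((src_filter * EXTEND_COUNT + src_extend) *
	 *		       FILTER_COUNT + mask_filter) *
	 *		       EXTEND_COUNT + mask_extend) *
	 *		       KERNEL_COUNT + kernel)
	 *
	 * Every combination is baked once at init, so, as with the cc
	 * states, nothing has to be computed or emitted per operation.
	 */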
3374
3375	state->cc = gen5_create_cc_unit_state(&general);
3376
3377	state->general_bo = sna_static_stream_fini(sna, &general);
3378	return state->general_bo != NULL;
3379}
3380
3381const char *gen5_render_init(struct sna *sna, const char *backend)
3382{
3383	if (!gen5_render_setup(sna))
3384		return backend;
3385
3386	sna->kgem.context_switch = gen5_render_context_switch;
3387	sna->kgem.retire = gen4_render_retire;
3388	sna->kgem.expire = gen4_render_expire;
3389
3390#if !NO_COMPOSITE
3391	sna->render.composite = gen5_render_composite;
3392	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
3393#endif
3394#if !NO_COMPOSITE_SPANS
3395	sna->render.check_composite_spans = gen5_check_composite_spans;
3396	sna->render.composite_spans = gen5_render_composite_spans;
3397	if (intel_get_device_id(sna->dev) == 0x0044)
3398		sna->render.prefer_gpu |= PREFER_GPU_SPANS;
3399#endif
3400	sna->render.video = gen5_render_video;
3401
3402	sna->render.copy_boxes = gen5_render_copy_boxes;
3403	sna->render.copy = gen5_render_copy;
3404
3405	sna->render.fill_boxes = gen5_render_fill_boxes;
3406	sna->render.fill = gen5_render_fill;
3407	sna->render.fill_one = gen5_render_fill_one;
3408
3409	sna->render.flush = gen4_render_flush;
3410	sna->render.reset = gen5_render_reset;
3411	sna->render.fini = gen5_render_fini;
3412
3413	sna->render.max_3d_size = MAX_3D_SIZE;
3414	sna->render.max_3d_pitch = 1 << 18;
3415	return "Ironlake (gen5)";
3416}
3417