gen5_render.c revision 03b705cf
/*
 * Copyright © 2006,2008,2011 Intel Corporation
 * Copyright © 2007 Red Hat, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Wang Zhenyu <zhenyu.z.wang@sna.com>
 *    Eric Anholt <eric@anholt.net>
 *    Carl Worth <cworth@redhat.com>
 *    Keith Packard <keithp@keithp.com>
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "sna.h"
#include "sna_reg.h"
#include "sna_render.h"
#include "sna_render_inline.h"
#include "sna_video.h"

#include "brw/brw.h"
#include "gen5_render.h"
#include "gen4_source.h"
#include "gen4_vertex.h"

#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0

#define PREFER_BLT_FILL 1

#define DBG_NO_STATE_CACHE 0
#define DBG_NO_SURFACE_CACHE 0

#define MAX_3D_SIZE 8192

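/* Convert a GRF register count into the dispatch encoding: one block per
 * sixteen registers, stored minus one.
 */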
#define GEN5_GRF_BLOCKS(nreg)    ((nreg + 15) / 16 - 1)

/* Set up a default static partitioning of the URB, which is supposed to
 * allow anything we would want to do, at potentially lower performance.
 */
#define URB_CS_ENTRY_SIZE     1
#define URB_CS_ENTRIES	      0

#define URB_VS_ENTRY_SIZE     1
#define URB_VS_ENTRIES	      256 /* minimum of 8 */

#define URB_GS_ENTRY_SIZE     0
#define URB_GS_ENTRIES	      0

#define URB_CLIP_ENTRY_SIZE   0
#define URB_CLIP_ENTRIES      0

#define URB_SF_ENTRY_SIZE     2
#define URB_SF_ENTRIES	      64

/*
 * This program computes dA/dx and dA/dy for the texture coordinates along
 * with the base texture coordinate. It was extracted from the Mesa driver.
 */

#define SF_KERNEL_NUM_GRF  16
#define SF_MAX_THREADS	   48

#define PS_KERNEL_NUM_GRF   32
#define PS_MAX_THREADS	    72

static const uint32_t ps_kernel_packed_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_argb.g5b"
#include "exa_wm_yuv_rgb.g5b"
#include "exa_wm_write.g5b"
};

static const uint32_t ps_kernel_planar_static[][4] = {
#include "exa_wm_xy.g5b"
#include "exa_wm_src_affine.g5b"
#include "exa_wm_src_sample_planar.g5b"
#include "exa_wm_yuv_rgb.g5b"
#include "exa_wm_write.g5b"
};

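/* NOKERNEL entries carry a generator function and a zero size: the program
 * is assembled at runtime by the brw assembler. KERNEL entries reference the
 * pre-compiled .g5b binaries included above.
 */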
#define NOKERNEL(kernel_enum, func, masked) \
    [kernel_enum] = {func, 0, masked}
#define KERNEL(kernel_enum, kernel, masked) \
    [kernel_enum] = {&kernel, sizeof(kernel), masked}
static const struct wm_kernel_info {
	const void *data;
	unsigned int size;
	bool has_mask;
} wm_kernels[] = {
	NOKERNEL(WM_KERNEL, brw_wm_kernel__affine, false),
	NOKERNEL(WM_KERNEL_P, brw_wm_kernel__projective, false),

	NOKERNEL(WM_KERNEL_MASK, brw_wm_kernel__affine_mask, true),
	NOKERNEL(WM_KERNEL_MASK_P, brw_wm_kernel__projective_mask, true),

	NOKERNEL(WM_KERNEL_MASKCA, brw_wm_kernel__affine_mask_ca, true),
	NOKERNEL(WM_KERNEL_MASKCA_P, brw_wm_kernel__projective_mask_ca, true),

	NOKERNEL(WM_KERNEL_MASKSA, brw_wm_kernel__affine_mask_sa, true),
	NOKERNEL(WM_KERNEL_MASKSA_P, brw_wm_kernel__projective_mask_sa, true),

	NOKERNEL(WM_KERNEL_OPACITY, brw_wm_kernel__affine_opacity, true),
	NOKERNEL(WM_KERNEL_OPACITY_P, brw_wm_kernel__projective_opacity, true),

	KERNEL(WM_KERNEL_VIDEO_PLANAR, ps_kernel_planar_static, false),
	KERNEL(WM_KERNEL_VIDEO_PACKED, ps_kernel_packed_static, false),
};
#undef KERNEL

static const struct blendinfo {
	bool src_alpha;
	uint32_t src_blend;
	uint32_t dst_blend;
} gen5_blend_op[] = {
	/* Clear */	{0, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_ZERO},
	/* Src */	{0, GEN5_BLENDFACTOR_ONE, GEN5_BLENDFACTOR_ZERO},
	/* Dst */	{0, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_ONE},
	/* Over */	{1, GEN5_BLENDFACTOR_ONE, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* OverReverse */ {0, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_ONE},
	/* In */	{0, GEN5_BLENDFACTOR_DST_ALPHA, GEN5_BLENDFACTOR_ZERO},
	/* InReverse */	{1, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_SRC_ALPHA},
	/* Out */	{0, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_ZERO},
	/* OutReverse */ {1, GEN5_BLENDFACTOR_ZERO, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* Atop */	{1, GEN5_BLENDFACTOR_DST_ALPHA, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* AtopReverse */ {1, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_SRC_ALPHA},
	/* Xor */	{1, GEN5_BLENDFACTOR_INV_DST_ALPHA, GEN5_BLENDFACTOR_INV_SRC_ALPHA},
	/* Add */	{0, GEN5_BLENDFACTOR_ONE, GEN5_BLENDFACTOR_ONE},
};

/**
 * Highest-valued BLENDFACTOR used in gen5_blend_op.
 *
 * This leaves out GEN5_BLENDFACTOR_INV_DST_COLOR,
 * GEN5_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
 * GEN5_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
 */
#define GEN5_BLENDFACTOR_COUNT (GEN5_BLENDFACTOR_INV_DST_ALPHA + 1)

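/* A colour-calculator (blend) state is pre-baked for every (src, dst)
 * blend-factor pair; each state occupies 64 bytes.
 */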
#define BLEND_OFFSET(s, d) \
	(((s) * GEN5_BLENDFACTOR_COUNT + (d)) * 64)

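/* The sampler states are cached as a dense array indexed by the filter and
 * extend modes of both the source and mask channels plus the WM kernel; each
 * 64-byte cell holds the sampler-state block for the source/mask pair.
 */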
#define SAMPLER_OFFSET(sf, se, mf, me, k) \
	((((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) * KERNEL_COUNT + (k)) * 64)

static bool
gen5_emit_pipelined_pointers(struct sna *sna,
			     const struct sna_composite_op *op,
			     int blend, int kernel);

#define OUT_BATCH(v) batch_emit(sna, v)
#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
#define OUT_VERTEX_F(v) vertex_emit(sna, v)

static inline bool too_large(int width, int height)
{
	return width > MAX_3D_SIZE || height > MAX_3D_SIZE;
}

static int
gen5_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
{
	int base;

	if (has_mask) {
		if (is_ca) {
			if (gen5_blend_op[op].src_alpha)
				base = WM_KERNEL_MASKSA;
			else
				base = WM_KERNEL_MASKCA;
		} else
			base = WM_KERNEL_MASK;
	} else
		base = WM_KERNEL;

	return base + !is_affine;
}

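/* Second half of the component-alpha workaround: when the operator also
 * needs the source alpha, the first pass only knocks out the destination,
 * so replay the same rectangles with PictOpAdd to accumulate the
 * mask-modulated source values on top.
 */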
static bool gen5_magic_ca_pass(struct sna *sna,
			       const struct sna_composite_op *op)
{
	struct gen5_render_state *state = &sna->render_state.gen5;

	if (!op->need_magic_ca_pass)
		return false;

	assert(sna->render.vertex_index > sna->render.vertex_start);

	DBG(("%s: CA fixup\n", __FUNCTION__));
	assert(op->mask.bo != NULL);
	assert(op->has_component_alpha);

	gen5_emit_pipelined_pointers
		(sna, op, PictOpAdd,
		 gen5_choose_composite_kernel(PictOpAdd,
					      true, true, op->is_affine));

	OUT_BATCH(GEN5_3DPRIMITIVE |
		  GEN5_3DPRIMITIVE_VERTEX_SEQUENTIAL |
		  (_3DPRIM_RECTLIST << GEN5_3DPRIMITIVE_TOPOLOGY_SHIFT) |
		  (0 << 9) |
		  4);
	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
	OUT_BATCH(sna->render.vertex_start);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */

	state->last_primitive = sna->kgem.nbatch;
	return true;
}

static uint32_t gen5_get_blend(int op,
			       bool has_component_alpha,
			       uint32_t dst_format)
{
	uint32_t src, dst;

	src = gen5_blend_op[op].src_blend;
	dst = gen5_blend_op[op].dst_blend;

	/* If there's no dst alpha channel, adjust the blend op so that we'll treat
	 * it as always 1.
	 */
	if (PICT_FORMAT_A(dst_format) == 0) {
		if (src == GEN5_BLENDFACTOR_DST_ALPHA)
			src = GEN5_BLENDFACTOR_ONE;
		else if (src == GEN5_BLENDFACTOR_INV_DST_ALPHA)
			src = GEN5_BLENDFACTOR_ZERO;
	}

	/* If the source alpha is being used, then we should only be in a
	 * case where the source blend factor is 0, and the source blend
	 * value is the mask channels multiplied by the source picture's alpha.
	 */
	if (has_component_alpha && gen5_blend_op[op].src_alpha) {
		if (dst == GEN5_BLENDFACTOR_SRC_ALPHA)
			dst = GEN5_BLENDFACTOR_SRC_COLOR;
		else if (dst == GEN5_BLENDFACTOR_INV_SRC_ALPHA)
			dst = GEN5_BLENDFACTOR_INV_SRC_COLOR;
	}

	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
	     op, dst_format, PICT_FORMAT_A(dst_format),
	     src, dst, BLEND_OFFSET(src, dst)));
	return BLEND_OFFSET(src, dst);
}

static uint32_t gen5_get_card_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
		return GEN5_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_x8r8g8b8:
		return GEN5_SURFACEFORMAT_B8G8R8X8_UNORM;
	case PICT_a8b8g8r8:
		return GEN5_SURFACEFORMAT_R8G8B8A8_UNORM;
	case PICT_x8b8g8r8:
		return GEN5_SURFACEFORMAT_R8G8B8X8_UNORM;
	case PICT_a2r10g10b10:
		return GEN5_SURFACEFORMAT_B10G10R10A2_UNORM;
	case PICT_x2r10g10b10:
		return GEN5_SURFACEFORMAT_B10G10R10X2_UNORM;
	case PICT_r8g8b8:
		return GEN5_SURFACEFORMAT_R8G8B8_UNORM;
	case PICT_r5g6b5:
		return GEN5_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_a1r5g5b5:
		return GEN5_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN5_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
		return GEN5_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

static uint32_t gen5_get_dest_format(PictFormat format)
{
	switch (format) {
	default:
		return -1;
	case PICT_a8r8g8b8:
	case PICT_x8r8g8b8:
		return GEN5_SURFACEFORMAT_B8G8R8A8_UNORM;
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
		return GEN5_SURFACEFORMAT_R8G8B8A8_UNORM;
	case PICT_a2r10g10b10:
	case PICT_x2r10g10b10:
		return GEN5_SURFACEFORMAT_B10G10R10A2_UNORM;
	case PICT_r5g6b5:
		return GEN5_SURFACEFORMAT_B5G6R5_UNORM;
	case PICT_x1r5g5b5:
	case PICT_a1r5g5b5:
		return GEN5_SURFACEFORMAT_B5G5R5A1_UNORM;
	case PICT_a8:
		return GEN5_SURFACEFORMAT_A8_UNORM;
	case PICT_a4r4g4b4:
	case PICT_x4r4g4b4:
		return GEN5_SURFACEFORMAT_B4G4R4A4_UNORM;
	}
}

static bool gen5_check_dst_format(PictFormat format)
{
	if (gen5_get_dest_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

static bool gen5_check_format(uint32_t format)
{
	if (gen5_get_card_format(format) != -1)
		return true;

	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
	return false;
}

typedef struct gen5_surface_state_padded {
	struct gen5_surface_state state;
	char pad[32 - sizeof(struct gen5_surface_state)];
} gen5_surface_state_padded;

static void null_create(struct sna_static_stream *stream)
{
	/* A bunch of zeros useful for legacy border color and depth-stencil */
	sna_static_stream_map(stream, 64, 64);
}

static void
sampler_state_init(struct gen5_sampler_state *sampler_state,
		   sampler_filter_t filter,
		   sampler_extend_t extend)
{
	sampler_state->ss0.lod_preclamp = 1;	/* GL mode */

	/* We use the legacy mode to get the semantics specified by
	 * the Render extension. */
	sampler_state->ss0.border_color_mode = GEN5_BORDER_COLOR_MODE_LEGACY;

	switch (filter) {
	default:
	case SAMPLER_FILTER_NEAREST:
		sampler_state->ss0.min_filter = GEN5_MAPFILTER_NEAREST;
		sampler_state->ss0.mag_filter = GEN5_MAPFILTER_NEAREST;
		break;
	case SAMPLER_FILTER_BILINEAR:
		sampler_state->ss0.min_filter = GEN5_MAPFILTER_LINEAR;
		sampler_state->ss0.mag_filter = GEN5_MAPFILTER_LINEAR;
		break;
	}

	switch (extend) {
	default:
	case SAMPLER_EXTEND_NONE:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_CLAMP_BORDER;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_CLAMP_BORDER;
		break;
	case SAMPLER_EXTEND_REPEAT:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_WRAP;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_WRAP;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_WRAP;
		break;
	case SAMPLER_EXTEND_PAD:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_CLAMP;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_CLAMP;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_CLAMP;
		break;
	case SAMPLER_EXTEND_REFLECT:
		sampler_state->ss1.r_wrap_mode = GEN5_TEXCOORDMODE_MIRROR;
		sampler_state->ss1.s_wrap_mode = GEN5_TEXCOORDMODE_MIRROR;
		sampler_state->ss1.t_wrap_mode = GEN5_TEXCOORDMODE_MIRROR;
		break;
	}
}

static uint32_t gen5_filter(uint32_t filter)
{
	switch (filter) {
	default:
		assert(0);
	case PictFilterNearest:
		return SAMPLER_FILTER_NEAREST;
	case PictFilterBilinear:
		return SAMPLER_FILTER_BILINEAR;
	}
}

static bool gen5_check_filter(PicturePtr picture)
{
	switch (picture->filter) {
	case PictFilterNearest:
	case PictFilterBilinear:
		return true;
	default:
		DBG(("%s: unknown filter: %x\n", __FUNCTION__, picture->filter));
		return false;
	}
}

static uint32_t gen5_repeat(uint32_t repeat)
{
	switch (repeat) {
	default:
		assert(0);
	case RepeatNone:
		return SAMPLER_EXTEND_NONE;
	case RepeatNormal:
		return SAMPLER_EXTEND_REPEAT;
	case RepeatPad:
		return SAMPLER_EXTEND_PAD;
	case RepeatReflect:
		return SAMPLER_EXTEND_REFLECT;
	}
}

static bool gen5_check_repeat(PicturePtr picture)
{
	if (!picture->repeat)
		return true;

	switch (picture->repeatType) {
	case RepeatNone:
	case RepeatNormal:
	case RepeatPad:
	case RepeatReflect:
		return true;
	default:
		DBG(("%s: unknown repeat: %x\n",
		     __FUNCTION__, picture->repeatType));
		return false;
	}
}

static uint32_t
gen5_tiling_bits(uint32_t tiling)
{
	switch (tiling) {
	default: assert(0);
	case I915_TILING_NONE: return 0;
	case I915_TILING_X: return GEN5_SURFACE_TILED;
	case I915_TILING_Y: return GEN5_SURFACE_TILED | GEN5_SURFACE_TILED_Y;
	}
}

/**
 * Sets up the common fields of a surface state buffer for the given
 * picture.
 */
static uint32_t
gen5_bind_bo(struct sna *sna,
	     struct kgem_bo *bo,
	     uint32_t width,
	     uint32_t height,
	     uint32_t format,
	     bool is_dst)
{
	uint32_t domains;
	uint16_t offset;
	uint32_t *ss;

	/* After the first bind, we manage the cache domains within the batch */
	if (!DBG_NO_SURFACE_CACHE) {
		offset = kgem_bo_get_binding(bo, format | is_dst << 31);
		if (offset) {
			if (is_dst)
				kgem_bo_mark_dirty(bo);
			return offset * sizeof(uint32_t);
		}
	}

	offset = sna->kgem.surface -=
		sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);
	ss = sna->kgem.batch + offset;

	ss[0] = (GEN5_SURFACE_2D << GEN5_SURFACE_TYPE_SHIFT |
		 GEN5_SURFACE_BLEND_ENABLED |
		 format << GEN5_SURFACE_FORMAT_SHIFT);

	if (is_dst) {
		ss[0] |= GEN5_SURFACE_RC_READ_WRITE;
		domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER;
	} else
		domains = I915_GEM_DOMAIN_SAMPLER << 16;
	ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0);

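	/* ss2/ss3: surface extent and pitch, each encoded minus one; the
	 * remaining dwords are left zero for our simple 2D surfaces.
	 */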
	ss[2] = ((width - 1)  << GEN5_SURFACE_WIDTH_SHIFT |
		 (height - 1) << GEN5_SURFACE_HEIGHT_SHIFT);
	ss[3] = (gen5_tiling_bits(bo->tiling) |
		 (bo->pitch - 1) << GEN5_SURFACE_PITCH_SHIFT);
	ss[4] = 0;
	ss[5] = 0;

	kgem_bo_set_binding(bo, format | is_dst << 31, offset);

	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
	     offset, bo->handle, ss[1],
	     format, width, height, bo->pitch, bo->tiling,
	     domains & 0xffff ? "render" : "sampler"));

	return offset * sizeof(uint32_t);
}

static void gen5_emit_vertex_buffer(struct sna *sna,
				    const struct sna_composite_op *op)
{
	int id = op->u.gen5.ve_id;

	assert((sna->render.vb_id & (1 << id)) == 0);

	OUT_BATCH(GEN5_3DSTATE_VERTEX_BUFFERS | 3);
	OUT_BATCH(id << VB0_BUFFER_INDEX_SHIFT | VB0_VERTEXDATA |
		  (4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
	assert(sna->render.nvertex_reloc < ARRAY_SIZE(sna->render.vertex_reloc));
	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
	OUT_BATCH(0);
	OUT_BATCH(~0); /* max address: disabled */
	OUT_BATCH(0);

	sna->render.vb_id |= 1 << id;
}

static void gen5_emit_primitive(struct sna *sna)
{
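	/* If the previous batch command was our 3DPRIMITIVE, reopen it:
	 * point vertex_offset back at its vertex-count dword and keep
	 * appending rectangles to the same primitive.
	 */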
	if (sna->kgem.nbatch == sna->render_state.gen5.last_primitive) {
		sna->render.vertex_offset = sna->kgem.nbatch - 5;
		return;
	}

	OUT_BATCH(GEN5_3DPRIMITIVE |
		  GEN5_3DPRIMITIVE_VERTEX_SEQUENTIAL |
		  (_3DPRIM_RECTLIST << GEN5_3DPRIMITIVE_TOPOLOGY_SHIFT) |
		  (0 << 9) |
		  4);
	sna->render.vertex_offset = sna->kgem.nbatch;
	OUT_BATCH(0);	/* vertex count, to be filled in later */
	OUT_BATCH(sna->render.vertex_index);
	OUT_BATCH(1);	/* single instance */
	OUT_BATCH(0);	/* start instance location */
	OUT_BATCH(0);	/* index buffer offset, ignored */
	sna->render.vertex_start = sna->render.vertex_index;

	sna->render_state.gen5.last_primitive = sna->kgem.nbatch;
}

static bool gen5_rectangle_begin(struct sna *sna,
				 const struct sna_composite_op *op)
{
	int id = op->u.gen5.ve_id;
	int ndwords;

	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
		return true;

	ndwords = op->need_magic_ca_pass ? 20 : 6;
	if ((sna->render.vb_id & (1 << id)) == 0)
		ndwords += 5;

	if (!kgem_check_batch(&sna->kgem, ndwords))
		return false;

	if ((sna->render.vb_id & (1 << id)) == 0)
		gen5_emit_vertex_buffer(sna, op);
	if (sna->render.vertex_offset == 0)
		gen5_emit_primitive(sna);

	return true;
}

static int gen5_get_rectangles__flush(struct sna *sna,
				      const struct sna_composite_op *op)
{
	/* Prevent discarding the new vbo after lock contention */
	if (sna_vertex_wait__locked(&sna->render)) {
		int rem = vertex_space(sna);
		if (rem > op->floats_per_rect)
			return rem;
	}

	if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 20 : 6))
		return 0;
	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
		return 0;

	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		if (gen5_magic_ca_pass(sna, op))
			gen5_emit_pipelined_pointers(sna, op, op->op,
						     op->u.gen5.wm_kernel);
	}

	return gen4_vertex_finish(sna);
}

inline static int gen5_get_rectangles(struct sna *sna,
				      const struct sna_composite_op *op,
				      int want,
				      void (*emit_state)(struct sna *sna,
							 const struct sna_composite_op *op))
{
	int rem;

	assert(want);

start:
	rem = vertex_space(sna);
	if (unlikely(rem < op->floats_per_rect)) {
		DBG(("flushing vbo for %s: %d < %d\n",
		     __FUNCTION__, rem, op->floats_per_rect));
		rem = gen5_get_rectangles__flush(sna, op);
		if (unlikely (rem == 0))
			goto flush;
	}

	if (unlikely(sna->render.vertex_offset == 0)) {
		if (!gen5_rectangle_begin(sna, op))
			goto flush;
		else
			goto start;
	}

	assert(rem <= vertex_space(sna));
	assert(op->floats_per_rect <= rem);
	if (want > 1 && want * op->floats_per_rect > rem)
		want = rem / op->floats_per_rect;

	sna->render.vertex_index += 3*want;
	return want;

flush:
	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		gen5_magic_ca_pass(sna, op);
	}
	sna_vertex_wait__locked(&sna->render);
	_kgem_submit(&sna->kgem);
	emit_state(sna, op);
	goto start;
}

static uint32_t *
gen5_composite_get_binding_table(struct sna *sna,
				 uint16_t *offset)
{
	sna->kgem.surface -=
		sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);

	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));

	/* Clear all surplus entries to zero in case of prefetch */
	*offset = sna->kgem.surface;
	return memset(sna->kgem.batch + sna->kgem.surface,
		      0, sizeof(struct gen5_surface_state_padded));
}

static void
gen5_emit_urb(struct sna *sna)
{
	int urb_vs_start, urb_vs_size;
	int urb_gs_start, urb_gs_size;
	int urb_clip_start, urb_clip_size;
	int urb_sf_start, urb_sf_size;
	int urb_cs_start, urb_cs_size;

	urb_vs_start = 0;
	urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
	urb_gs_start = urb_vs_start + urb_vs_size;
	urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
	urb_clip_start = urb_gs_start + urb_gs_size;
	urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
	urb_sf_start = urb_clip_start + urb_clip_size;
	urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
	urb_cs_start = urb_sf_start + urb_sf_size;
	urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;

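	/* The URB is partitioned in the fixed VS -> GS -> CLIP -> SF -> CS
	 * order; each fence field holds the end offset of its section.
	 */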
	OUT_BATCH(GEN5_URB_FENCE |
		  UF0_CS_REALLOC |
		  UF0_SF_REALLOC |
		  UF0_CLIP_REALLOC |
		  UF0_GS_REALLOC |
		  UF0_VS_REALLOC |
		  1);
	OUT_BATCH(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
		  ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
		  ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
	OUT_BATCH(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
		  ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));

	/* Constant buffer state */
	OUT_BATCH(GEN5_CS_URB_STATE | 0);
	OUT_BATCH((URB_CS_ENTRY_SIZE - 1) << 4 | URB_CS_ENTRIES << 0);
}

static void
gen5_emit_state_base_address(struct sna *sna)
{
	assert(sna->render_state.gen5.general_bo->proxy == NULL);
	OUT_BATCH(GEN5_STATE_BASE_ADDRESS | 6);
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* general */
				 sna->kgem.nbatch,
				 sna->render_state.gen5.general_bo,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));
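	/* Binding tables and surface state are written into the tail of the
	 * batch itself, so the surface base relocates against the batch bo
	 * (the NULL reloc target below).
	 */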
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* surface */
				 sna->kgem.nbatch,
				 NULL,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));
	OUT_BATCH(0); /* media */
	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* instruction */
				 sna->kgem.nbatch,
				 sna->render_state.gen5.general_bo,
				 I915_GEM_DOMAIN_INSTRUCTION << 16,
				 BASE_ADDRESS_MODIFY));

	/* upper bounds, all disabled */
	OUT_BATCH(BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(BASE_ADDRESS_MODIFY);
}

static void
gen5_emit_invariant(struct sna *sna)
{
	/* Ironlake errata workaround: Before disabling the clipper,
	 * you have to MI_FLUSH to get the pipeline idle.
	 *
	 * However, the kernel flushes the pipeline between batches,
	 * so we should be safe....
	 *
	 * On the other hand, after using BLT we must use a non-pipelined
	 * operation...
	 */
	if (sna->kgem.nreloc)
		OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);

	OUT_BATCH(GEN5_PIPELINE_SELECT | PIPELINE_SELECT_3D);

	gen5_emit_state_base_address(sna);

	sna->render_state.gen5.needs_invariant = false;
}

static void
gen5_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);

	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 4)) {
		DBG(("%s: flushing batch: %d < %d+%d\n",
		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
		     150, 4*8));
		kgem_submit(&sna->kgem);
		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
	}

	if (sna->render_state.gen5.needs_invariant)
		gen5_emit_invariant(sna);
}

static void
gen5_align_vertex(struct sna *sna, const struct sna_composite_op *op)
{
	assert(op->floats_per_rect == 3*op->floats_per_vertex);
	if (op->floats_per_vertex != sna->render_state.gen5.floats_per_vertex) {
		if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
			gen4_vertex_finish(sna);

		DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
		     sna->render_state.gen5.floats_per_vertex,
		     op->floats_per_vertex,
		     sna->render.vertex_index,
		     (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex));
		sna->render.vertex_index = (sna->render.vertex_used + op->floats_per_vertex - 1) / op->floats_per_vertex;
		sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex;
		sna->render_state.gen5.floats_per_vertex = op->floats_per_vertex;
	}
}

static void
gen5_emit_binding_table(struct sna *sna, uint16_t offset)
{
	if (!DBG_NO_STATE_CACHE &&
	    sna->render_state.gen5.surface_table == offset)
		return;

	sna->render_state.gen5.surface_table = offset;

	/* Binding table pointers */
	OUT_BATCH(GEN5_3DSTATE_BINDING_TABLE_POINTERS | 4);
	OUT_BATCH(0);		/* vs */
	OUT_BATCH(0);		/* gs */
	OUT_BATCH(0);		/* clip */
	OUT_BATCH(0);		/* sf */
	/* Only the PS uses the binding table */
	OUT_BATCH(offset*4);
}

static bool
gen5_emit_pipelined_pointers(struct sna *sna,
			     const struct sna_composite_op *op,
			     int blend, int kernel)
{
	uint16_t sp, bp;
	uint32_t key;

836	DBG(("%s: has_mask=%d, src=(%d, %d), mask=(%d, %d),kernel=%d, blend=%d, ca=%d, format=%x\n",
837	     __FUNCTION__, op->u.gen5.ve_id & 2,
838	     op->src.filter, op->src.repeat,
839	     op->mask.filter, op->mask.repeat,
840	     kernel, blend, op->has_component_alpha, (int)op->dst.format));

	sp = SAMPLER_OFFSET(op->src.filter, op->src.repeat,
			    op->mask.filter, op->mask.repeat,
			    kernel);
	bp = gen5_get_blend(blend, op->has_component_alpha, op->dst.format);

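	/* Pack the sampler offset, blend offset and has-mask flag into a
	 * single key so that a redundant state emission is caught by one
	 * comparison.
	 */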
	key = sp | (uint32_t)bp << 16 | (op->mask.bo != NULL) << 31;
	DBG(("%s: sp=%d, bp=%d, key=%08x (current sp=%d, bp=%d, key=%08x)\n",
	     __FUNCTION__, sp, bp, key,
	     sna->render_state.gen5.last_pipelined_pointers & 0xffff,
	     (sna->render_state.gen5.last_pipelined_pointers >> 16) & 0x7fff,
	     sna->render_state.gen5.last_pipelined_pointers));
	if (key == sna->render_state.gen5.last_pipelined_pointers)
		return false;

	OUT_BATCH(GEN5_3DSTATE_PIPELINED_POINTERS | 5);
	OUT_BATCH(sna->render_state.gen5.vs);
	OUT_BATCH(GEN5_GS_DISABLE); /* passthrough */
	OUT_BATCH(GEN5_CLIP_DISABLE); /* passthrough */
	OUT_BATCH(sna->render_state.gen5.sf[op->mask.bo != NULL]);
	OUT_BATCH(sna->render_state.gen5.wm + sp);
	OUT_BATCH(sna->render_state.gen5.cc + bp);

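	/* Reuse bp as the return value: report whether the blend state
	 * changed, as only that may require the caller to emit a flush.
	 */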
	bp = (sna->render_state.gen5.last_pipelined_pointers & 0x7fff0000) != ((uint32_t)bp << 16);
	sna->render_state.gen5.last_pipelined_pointers = key;

	gen5_emit_urb(sna);

	return bp;
}

static bool
gen5_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
{
	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;

	assert(!too_large(op->dst.x, op->dst.y));
	assert(!too_large(op->dst.width, op->dst.height));

	if (!DBG_NO_STATE_CACHE &&
	    sna->render_state.gen5.drawrect_limit == limit &&
	    sna->render_state.gen5.drawrect_offset == offset)
		return false;

	sna->render_state.gen5.drawrect_offset = offset;
	sna->render_state.gen5.drawrect_limit = limit;

	OUT_BATCH(GEN5_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
	OUT_BATCH(0x00000000);
	OUT_BATCH(limit);
	OUT_BATCH(offset);
	return true;
}

static void
gen5_emit_vertex_elements(struct sna *sna,
			  const struct sna_composite_op *op)
{
	/*
	 * vertex data in vertex buffer
	 *    position: (x, y)
	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
	 *    texture coordinate 1 if (has_mask is true): same as above
	 */
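	/* ve_id encodes the vertex layout: the low two bits give the number
	 * of floats in the source coordinate channel (0 meaning a pair of
	 * int16 coordinates instead), while the high bits give the float
	 * count of the mask channel and double as the has_mask flag.
	 */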
	struct gen5_render_state *render = &sna->render_state.gen5;
	int id = op->u.gen5.ve_id;
	bool has_mask = id >> 2;
	uint32_t format, dw;

	if (!DBG_NO_STATE_CACHE && render->ve_id == id)
		return;

	DBG(("%s: changing %d -> %d\n", __FUNCTION__, render->ve_id, id));
	render->ve_id = id;

	/* The VUE layout
	 *    dword 0-3: pad (0.0, 0.0, 0.0, 0.0)
	 *    dword 4-7: position (x, y, 1.0, 1.0),
	 *    dword 8-11: texture coordinate 0 (u0, v0, w0, 1.0)
	 *    dword 12-15: texture coordinate 1 (u1, v1, w1, 1.0)
	 *
	 * dword 4-15 are fetched from vertex buffer
	 */
	OUT_BATCH(GEN5_3DSTATE_VERTEX_ELEMENTS |
		((2 * (has_mask ? 4 : 3)) + 1 - 2));

	OUT_BATCH((id << VE0_VERTEX_BUFFER_INDEX_SHIFT) | VE0_VALID |
		  (GEN5_SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT) |
		  (0 << VE0_OFFSET_SHIFT));
	OUT_BATCH((VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT) |
		  (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT) |
		  (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT) |
		  (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT));

	/* x,y */
	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
		  GEN5_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
		  0 << VE0_OFFSET_SHIFT);
	OUT_BATCH(VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
		  VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);

	/* u0, v0, w0 */
	DBG(("%s: id=%d, first channel %d floats, offset=4b\n", __FUNCTION__,
	     id, id & 3));
	dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
	switch (id & 3) {
	default:
		assert(0);
	case 0:
		format = GEN5_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 1:
		format = GEN5_SURFACEFORMAT_R32_FLOAT << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 2:
		format = GEN5_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
		break;
	case 3:
		format = GEN5_SURFACEFORMAT_R32G32B32_FLOAT << VE0_FORMAT_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
		break;
	}
	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
		  format | 4 << VE0_OFFSET_SHIFT);
	OUT_BATCH(dw);

	/* u1, v1, w1 */
	if (has_mask) {
		unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
		DBG(("%s: id=%x, second channel %d floats, offset=%db\n", __FUNCTION__,
		     id, id >> 2, offset));
		dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
		switch (id >> 2) {
		case 1:
			format = GEN5_SURFACEFORMAT_R32_FLOAT << VE0_FORMAT_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
			break;
		default:
			assert(0);
		case 2:
			format = GEN5_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
			break;
		case 3:
			format = GEN5_SURFACEFORMAT_R32G32B32_FLOAT << VE0_FORMAT_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
			break;
		}
		OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
			  format | offset << VE0_OFFSET_SHIFT);
		OUT_BATCH(dw);
	}
}

inline static void
gen5_emit_pipe_flush(struct sna *sna)
{
	OUT_BATCH(GEN5_PIPE_CONTROL | (4 - 2));
	OUT_BATCH(GEN5_PIPE_CONTROL_WC_FLUSH);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen5_emit_state(struct sna *sna,
		const struct sna_composite_op *op,
		uint16_t offset)
{
	bool flush = false;

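	/* The caller passes the destination-dirty flag in the low bit of
	 * offset (the binding-table offset itself is always more strictly
	 * aligned), so mask it off before use.
	 */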
	assert(op->dst.bo->exec);

	/* drawrect must be first for Ironlake BLT workaround */
	if (gen5_emit_drawing_rectangle(sna, op))
		offset &= ~1;
	gen5_emit_binding_table(sna, offset & ~1);
	if (gen5_emit_pipelined_pointers(sna, op, op->op, op->u.gen5.wm_kernel)) {
		DBG(("%s: changed blend state, flush required? %d\n",
		     __FUNCTION__, (offset & 1) && op->op > PictOpSrc));
		flush = (offset & 1) && op->op > PictOpSrc;
	}
	gen5_emit_vertex_elements(sna, op);

	if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
		DBG(("%s: flushing dirty (%d, %d)\n", __FUNCTION__,
		     kgem_bo_is_dirty(op->src.bo),
		     kgem_bo_is_dirty(op->mask.bo)));
		OUT_BATCH(MI_FLUSH);
		kgem_clear_dirty(&sna->kgem);
		kgem_bo_mark_dirty(op->dst.bo);
		flush = false;
	}
	if (flush) {
		DBG(("%s: forcing flush\n", __FUNCTION__));
		gen5_emit_pipe_flush(sna);
	}
}

static void gen5_bind_surfaces(struct sna *sna,
			       const struct sna_composite_op *op)
{
	bool dirty = kgem_bo_is_dirty(op->dst.bo);
	uint32_t *binding_table;
	uint16_t offset;

	gen5_get_batch(sna, op);

	binding_table = gen5_composite_get_binding_table(sna, &offset);

	binding_table[0] =
		gen5_bind_bo(sna,
			    op->dst.bo, op->dst.width, op->dst.height,
			    gen5_get_dest_format(op->dst.format),
			    true);
	binding_table[1] =
		gen5_bind_bo(sna,
			     op->src.bo, op->src.width, op->src.height,
			     op->src.card_format,
			     false);
	if (op->mask.bo) {
		assert(op->u.gen5.ve_id >> 2);
		binding_table[2] =
			gen5_bind_bo(sna,
				     op->mask.bo,
				     op->mask.width,
				     op->mask.height,
				     op->mask.card_format,
				     false);
	}

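	/* If this binding table matches the one last emitted, rewind the
	 * surface allocation and reuse the previous table.
	 */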
	if (sna->kgem.surface == offset &&
	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen5.surface_table) == *(uint64_t*)binding_table &&
	    (op->mask.bo == NULL ||
	     sna->kgem.batch[sna->render_state.gen5.surface_table+2] == binding_table[2])) {
		sna->kgem.surface += sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);
		offset = sna->render_state.gen5.surface_table;
	}

	gen5_emit_state(sna, op, offset | dirty);
}

fastcall static void
gen5_render_composite_blt(struct sna *sna,
			  const struct sna_composite_op *op,
			  const struct sna_composite_rectangles *r)
{
	DBG(("%s: src=(%d, %d)+(%d, %d), mask=(%d, %d)+(%d, %d), dst=(%d, %d)+(%d, %d), size=(%d, %d)\n",
	     __FUNCTION__,
	     r->src.x, r->src.y, op->src.offset[0], op->src.offset[1],
	     r->mask.x, r->mask.y, op->mask.offset[0], op->mask.offset[1],
	     r->dst.x, r->dst.y, op->dst.x, op->dst.y,
	     r->width, r->height));

	gen5_get_rectangles(sna, op, 1, gen5_bind_surfaces);
	op->prim_emit(sna, op, r);
}

fastcall static void
gen5_render_composite_box(struct sna *sna,
			  const struct sna_composite_op *op,
			  const BoxRec *box)
{
	struct sna_composite_rectangles r;

	DBG(("  %s: (%d, %d), (%d, %d)\n",
	     __FUNCTION__,
	     box->x1, box->y1, box->x2, box->y2));

	gen5_get_rectangles(sna, op, 1, gen5_bind_surfaces);

	r.dst.x = box->x1;
	r.dst.y = box->y1;
	r.width  = box->x2 - box->x1;
	r.height = box->y2 - box->y1;
	r.mask = r.src = r.dst;

	op->prim_emit(sna, op, &r);
}

static void
gen5_render_composite_boxes__blt(struct sna *sna,
				 const struct sna_composite_op *op,
				 const BoxRec *box, int nbox)
{
	DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
	     __FUNCTION__, nbox, op->dst.x, op->dst.y,
	     op->src.offset[0], op->src.offset[1],
	     op->src.width, op->src.height,
	     op->mask.offset[0], op->mask.offset[1],
	     op->mask.width, op->mask.height));

	do {
		int nbox_this_time;

		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
						     gen5_bind_surfaces);
		nbox -= nbox_this_time;

		do {
			struct sna_composite_rectangles r;

			DBG(("  %s: (%d, %d), (%d, %d)\n",
			     __FUNCTION__,
			     box->x1, box->y1, box->x2, box->y2));

			r.dst.x = box->x1;
			r.dst.y = box->y1;
			r.width  = box->x2 - box->x1;
			r.height = box->y2 - box->y1;
			r.mask = r.src = r.dst;
			op->prim_emit(sna, op, &r);
			box++;
		} while (--nbox_this_time);
	} while (nbox);
}

static void
gen5_render_composite_boxes(struct sna *sna,
			    const struct sna_composite_op *op,
			    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
						     gen5_bind_surfaces);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;
	} while (nbox);
}

static void
gen5_render_composite_boxes__thread(struct sna *sna,
				    const struct sna_composite_op *op,
				    const BoxRec *box, int nbox)
{
	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));

	sna_vertex_lock(&sna->render);
	do {
		int nbox_this_time;
		float *v;

		nbox_this_time = gen5_get_rectangles(sna, op, nbox,
						     gen5_bind_surfaces);
		assert(nbox_this_time);
		nbox -= nbox_this_time;

		v = sna->render.vertices + sna->render.vertex_used;
		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;

		sna_vertex_acquire__locked(&sna->render);
		sna_vertex_unlock(&sna->render);

		op->emit_boxes(op, box, nbox_this_time, v);
		box += nbox_this_time;

		sna_vertex_lock(&sna->render);
		sna_vertex_release__locked(&sna->render);
	} while (nbox);
	sna_vertex_unlock(&sna->render);
}

#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

static uint32_t gen5_bind_video_source(struct sna *sna,
				       struct kgem_bo *src_bo,
				       uint32_t src_offset,
				       int src_width,
				       int src_height,
				       int src_pitch,
				       uint32_t src_surf_format)
{
	struct gen5_surface_state *ss;

	sna->kgem.surface -= sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);

	ss = memset(sna->kgem.batch + sna->kgem.surface, 0, sizeof(*ss));
	ss->ss0.surface_type = GEN5_SURFACE_2D;
	ss->ss0.surface_format = src_surf_format;
	ss->ss0.color_blend = 1;

	ss->ss1.base_addr =
		kgem_add_reloc(&sna->kgem,
			       sna->kgem.surface + 1,
			       src_bo,
			       I915_GEM_DOMAIN_SAMPLER << 16,
			       src_offset);

	ss->ss2.width  = src_width - 1;
	ss->ss2.height = src_height - 1;
	ss->ss3.pitch  = src_pitch - 1;

	return sna->kgem.surface * sizeof(uint32_t);
}

static void gen5_video_bind_surfaces(struct sna *sna,
				     const struct sna_composite_op *op)
{
	bool dirty = kgem_bo_is_dirty(op->dst.bo);
	struct sna_video_frame *frame = op->priv;
	uint32_t src_surf_format;
	uint32_t src_surf_base[6];
	int src_width[6];
	int src_height[6];
	int src_pitch[6];
	uint32_t *binding_table;
	uint16_t offset;
	int n_src, n;

	src_surf_base[0] = 0;
	src_surf_base[1] = 0;
	src_surf_base[2] = frame->VBufOffset;
	src_surf_base[3] = frame->VBufOffset;
	src_surf_base[4] = frame->UBufOffset;
	src_surf_base[5] = frame->UBufOffset;

	if (is_planar_fourcc(frame->id)) {
		src_surf_format = GEN5_SURFACEFORMAT_R8_UNORM;
		src_width[1]  = src_width[0]  = frame->width;
		src_height[1] = src_height[0] = frame->height;
		src_pitch[1]  = src_pitch[0]  = frame->pitch[1];
		src_width[4]  = src_width[5]  = src_width[2]  = src_width[3] =
			frame->width / 2;
		src_height[4] = src_height[5] = src_height[2] = src_height[3] =
			frame->height / 2;
		src_pitch[4]  = src_pitch[5]  = src_pitch[2]  = src_pitch[3] =
			frame->pitch[0];
		n_src = 6;
	} else {
		if (frame->id == FOURCC_UYVY)
			src_surf_format = GEN5_SURFACEFORMAT_YCRCB_SWAPY;
		else
			src_surf_format = GEN5_SURFACEFORMAT_YCRCB_NORMAL;

		src_width[0]  = frame->width;
		src_height[0] = frame->height;
		src_pitch[0]  = frame->pitch[0];
		n_src = 1;
	}

	gen5_get_batch(sna, op);

	binding_table = gen5_composite_get_binding_table(sna, &offset);
	binding_table[0] =
		gen5_bind_bo(sna,
			     op->dst.bo, op->dst.width, op->dst.height,
			     gen5_get_dest_format(op->dst.format),
			     true);
	for (n = 0; n < n_src; n++) {
		binding_table[1+n] =
			gen5_bind_video_source(sna,
					       frame->bo,
					       src_surf_base[n],
					       src_width[n],
					       src_height[n],
					       src_pitch[n],
					       src_surf_format);
	}

	gen5_emit_state(sna, op, offset | dirty);
}

static bool
gen5_render_video(struct sna *sna,
		  struct sna_video *video,
		  struct sna_video_frame *frame,
		  RegionPtr dstRegion,
		  PixmapPtr pixmap)
{
	struct sna_composite_op tmp;
	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
	int src_width = frame->src.x2 - frame->src.x1;
	int src_height = frame->src.y2 - frame->src.y1;
	float src_offset_x, src_offset_y;
	float src_scale_x, src_scale_y;
	int nbox, pix_xoff, pix_yoff;
	struct sna_pixmap *priv;
	BoxPtr box;

	DBG(("%s: %dx%d -> %dx%d\n", __FUNCTION__,
	     src_width, src_height, dst_width, dst_height));

	priv = sna_pixmap_force_to_gpu(pixmap, MOVE_READ | MOVE_WRITE);
	if (priv == NULL)
		return false;

	memset(&tmp, 0, sizeof(tmp));

	tmp.op = PictOpSrc;
	tmp.dst.pixmap = pixmap;
	tmp.dst.width  = pixmap->drawable.width;
	tmp.dst.height = pixmap->drawable.height;
	tmp.dst.format = sna_format_for_depth(pixmap->drawable.depth);
	tmp.dst.bo = priv->gpu_bo;

	if (src_width == dst_width && src_height == dst_height)
		tmp.src.filter = SAMPLER_FILTER_NEAREST;
	else
		tmp.src.filter = SAMPLER_FILTER_BILINEAR;
	tmp.src.repeat = SAMPLER_EXTEND_PAD;
	tmp.src.bo = frame->bo;
	tmp.mask.bo = NULL;
	tmp.u.gen5.wm_kernel =
		is_planar_fourcc(frame->id) ? WM_KERNEL_VIDEO_PLANAR : WM_KERNEL_VIDEO_PACKED;
	tmp.u.gen5.ve_id = 2;
	tmp.is_affine = true;
	tmp.floats_per_vertex = 3;
	tmp.floats_per_rect = 9;
	tmp.priv = frame;

	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
		kgem_submit(&sna->kgem);
		assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
	}

	gen5_video_bind_surfaces(sna, &tmp);
	gen5_align_vertex(sna, &tmp);

	/* Set up the offset for translating from the given region (in screen
	 * coordinates) to the backing pixmap.
	 */
#ifdef COMPOSITE
	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
#else
	pix_xoff = 0;
	pix_yoff = 0;
#endif

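	/* Map destination pixels onto normalised source texture coordinates:
	 * scale by (source extent / destination extent) over the frame size,
	 * then bias by the source origin.
	 */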
	src_scale_x = (float)src_width / dst_width / frame->width;
	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;

	src_scale_y = (float)src_height / dst_height / frame->height;
	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;

	box = REGION_RECTS(dstRegion);
	nbox = REGION_NUM_RECTS(dstRegion);
	while (nbox--) {
		BoxRec r;

		r.x1 = box->x1 + pix_xoff;
		r.x2 = box->x2 + pix_xoff;
		r.y1 = box->y1 + pix_yoff;
		r.y2 = box->y2 + pix_yoff;

		gen5_get_rectangles(sna, &tmp, 1, gen5_video_bind_surfaces);

		OUT_VERTEX(r.x2, r.y2);
		OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

		OUT_VERTEX(r.x1, r.y2);
		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);

		OUT_VERTEX(r.x1, r.y1);
		OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
		OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);

		if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
			sna_damage_add_box(&priv->gpu_damage, &r);
			sna_damage_subtract_box(&priv->cpu_damage, &r);
		}
		box++;
	}

	gen4_vertex_flush(sna);
	return true;
}

static int
gen5_composite_picture(struct sna *sna,
		       PicturePtr picture,
		       struct sna_composite_channel *channel,
		       int x, int y,
		       int w, int h,
		       int dst_x, int dst_y,
		       bool precise)
{
	PixmapPtr pixmap;
	uint32_t color;
	int16_t dx, dy;

	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
	     __FUNCTION__, x, y, w, h, dst_x, dst_y));

	channel->is_solid = false;
	channel->card_format = -1;

	if (sna_picture_is_solid(picture, &color))
		return gen4_channel_init_solid(sna, channel, color);

	if (picture->pDrawable == NULL) {
		int ret;

		if (picture->pSourcePict->type == SourcePictTypeLinear)
			return gen4_channel_init_linear(sna, picture, channel,
							x, y,
							w, h,
							dst_x, dst_y);

		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
		ret = -1;
		if (!precise)
			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
								      x, y, w, h, dst_x, dst_y);
		if (ret == -1)
			ret = sna_render_picture_fixup(sna, picture, channel,
						       x, y, w, h, dst_x, dst_y);
		return ret;
	}

	if (picture->alphaMap) {
		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);
	}

	if (!gen5_check_repeat(picture))
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);

	if (!gen5_check_filter(picture))
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h, dst_x, dst_y);

	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
	channel->filter = picture->filter;

	pixmap = get_drawable_pixmap(picture->pDrawable);
	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);

	x += dx + picture->pDrawable->x;
	y += dy + picture->pDrawable->y;

	channel->is_affine = sna_transform_is_affine(picture->transform);
	if (sna_transform_is_integer_translation(picture->transform, &dx, &dy)) {
		DBG(("%s: integer translation (%d, %d), removing\n",
		     __FUNCTION__, dx, dy));
		x += dx;
		y += dy;
		channel->transform = NULL;
		channel->filter = PictFilterNearest;
	} else
		channel->transform = picture->transform;

	channel->pict_format = picture->format;
	channel->card_format = gen5_get_card_format(picture->format);
	if (channel->card_format == -1)
		return sna_render_picture_convert(sna, picture, channel, pixmap,
						  x, y, w, h, dst_x, dst_y,
						  false);

	if (too_large(pixmap->drawable.width, pixmap->drawable.height))
		return sna_render_picture_extract(sna, picture, channel,
						  x, y, w, h, dst_x, dst_y);

	return sna_render_pixmap_bo(sna, channel, pixmap,
				    x, y, w, h, dst_x, dst_y);
}

static void gen5_composite_channel_convert(struct sna_composite_channel *channel)
{
	channel->repeat = gen5_repeat(channel->repeat);
	channel->filter = gen5_filter(channel->filter);
	if (channel->card_format == (unsigned)-1)
		channel->card_format = gen5_get_card_format(channel->pict_format);
}

static void
gen5_render_composite_done(struct sna *sna,
			   const struct sna_composite_op *op)
{
	if (sna->render.vertex_offset) {
		gen4_vertex_flush(sna);
		gen5_magic_ca_pass(sna, op);
	}

	DBG(("%s()\n", __FUNCTION__));

	if (op->mask.bo)
		kgem_bo_destroy(&sna->kgem, op->mask.bo);
	if (op->src.bo)
		kgem_bo_destroy(&sna->kgem, op->src.bo);

	sna_render_composite_redirect_done(sna, op);
}

static bool
gen5_composite_set_target(struct sna *sna,
			  struct sna_composite_op *op,
			  PicturePtr dst,
			  int x, int y, int w, int h,
			  bool partial)
{
	BoxRec box;

	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
	op->dst.width  = op->dst.pixmap->drawable.width;
	op->dst.height = op->dst.pixmap->drawable.height;
	op->dst.format = dst->format;
	if (w && h) {
		box.x1 = x;
		box.y1 = y;
		box.x2 = x + w;
		box.y2 = y + h;
	} else
		sna_render_picture_extents(dst, &box);

	op->dst.bo = sna_drawable_use_bo(dst->pDrawable,
					 PREFER_GPU | FORCE_GPU | RENDER_GPU,
					 &box, &op->damage);
	if (op->dst.bo == NULL)
		return false;

	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
			    &op->dst.x, &op->dst.y);

	DBG(("%s: pixmap=%p, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
	     __FUNCTION__,
	     op->dst.pixmap, (int)op->dst.format,
	     op->dst.width, op->dst.height,
	     op->dst.bo->pitch,
	     op->dst.x, op->dst.y,
	     op->damage ? *op->damage : (void *)-1));

	assert(op->dst.bo->proxy == NULL);

	if (too_large(op->dst.width, op->dst.height) &&
	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
		return false;

	return true;
}

static bool
try_blt(struct sna *sna,
	PicturePtr dst, PicturePtr src,
	int width, int height)
{
	if (sna->kgem.mode != KGEM_RENDER) {
		DBG(("%s: already performing BLT\n", __FUNCTION__));
		return true;
	}

	if (too_large(width, height)) {
		DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
		     __FUNCTION__, width, height));
		return true;
	}

	if (too_large(dst->pDrawable->width, dst->pDrawable->height))
		return true;

	/* The blitter is much faster for solids */
	if (sna_picture_is_solid(src, NULL))
		return true;

	/* is the source picture only in cpu memory e.g. a shm pixmap? */
	return picture_is_cpu(sna, src);
}

static bool
is_gradient(PicturePtr picture, bool precise)
{
	if (picture->pDrawable)
		return false;

	switch (picture->pSourcePict->type) {
	case SourcePictTypeSolidFill:
	case SourcePictTypeLinear:
		return false;
	default:
		return precise;
	}
}

static bool
has_alphamap(PicturePtr p)
{
	return p->alphaMap != NULL;
}

static bool
need_upload(struct sna *sna, PicturePtr p)
{
	return p->pDrawable && untransformed(p) &&
		!is_gpu(sna, p->pDrawable, PREFER_GPU_RENDER);
}

static bool
source_is_busy(PixmapPtr pixmap)
{
	struct sna_pixmap *priv = sna_pixmap(pixmap);
	if (priv == NULL)
		return false;

	if (priv->clear)
		return false;

	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
		return true;

	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
		return true;

	return priv->gpu_damage && !priv->cpu_damage;
}

static bool
source_fallback(struct sna *sna, PicturePtr p, PixmapPtr pixmap, bool precise)
{
	if (sna_picture_is_solid(p, NULL))
		return false;

	if (is_gradient(p, precise) ||
	    !gen5_check_repeat(p) ||
	    !gen5_check_format(p->format))
		return true;

	if (pixmap && source_is_busy(pixmap))
		return false;

	return has_alphamap(p) || !gen5_check_filter(p) || need_upload(sna, p);
}

static bool
gen5_composite_fallback(struct sna *sna,
			PicturePtr src,
			PicturePtr mask,
			PicturePtr dst)
{
	PixmapPtr src_pixmap;
	PixmapPtr mask_pixmap;
	PixmapPtr dst_pixmap;
	bool src_fallback, mask_fallback;

	if (!gen5_check_dst_format(dst->format)) {
		DBG(("%s: unknown destination format: %d\n",
		     __FUNCTION__, dst->format));
		return true;
	}

	dst_pixmap = get_drawable_pixmap(dst->pDrawable);

	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
	src_fallback = source_fallback(sna, src, src_pixmap,
				       dst->polyMode == PolyModePrecise);

	if (mask) {
		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
		mask_fallback = source_fallback(sna, mask, mask_pixmap,
						dst->polyMode == PolyModePrecise);
	} else {
		mask_pixmap = NULL;
		mask_fallback = false;
	}

	/* If we are using the destination as a source and need to
	 * read back in order to upload the source, do it all
	 * on the CPU.
	 */
	if (src_pixmap == dst_pixmap && src_fallback) {
		DBG(("%s: src is dst and will fallback\n", __FUNCTION__));
		return true;
	}
	if (mask_pixmap == dst_pixmap && mask_fallback) {
		DBG(("%s: mask is dst and will fallback\n", __FUNCTION__));
		return true;
	}

	/* If anything is on the GPU, push everything out to the GPU */
	if (dst_use_gpu(dst_pixmap)) {
		DBG(("%s: dst is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	if (src_pixmap && !src_fallback) {
		DBG(("%s: src is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}
	if (mask_pixmap && !mask_fallback) {
		DBG(("%s: mask is already on the GPU, try to use GPU\n",
		     __FUNCTION__));
		return false;
	}

	/* However, if the dst is not on the GPU and we need to
	 * render one of the sources using the CPU, we may
	 * as well do the entire operation in place on the CPU.
	 */
1767	if (src_fallback) {
1768		DBG(("%s: dst is on the CPU and src will fallback\n",
1769		     __FUNCTION__));
1770		return true;
1771	}
1772
1773	if (mask_fallback) {
1774		DBG(("%s: dst is on the CPU and mask will fallback\n",
1775		     __FUNCTION__));
1776		return true;
1777	}
1778
1779	if (too_large(dst_pixmap->drawable.width,
1780		      dst_pixmap->drawable.height) &&
1781	    dst_is_cpu(dst_pixmap)) {
1782		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
1783		return true;
1784	}
1785
1786	DBG(("%s: dst is not on the GPU and the operation should not fall back\n",
1787	     __FUNCTION__));
1788	return dst_use_cpu(dst_pixmap);
1789}
1790
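/* Try to satisfy the mask channel from the already-prepared source
 * channel: either they are literally the same picture, the mask is a
 * solid colour, or both sample the same drawable with identical
 * transform, alphamap, repeat, filter and format.
 */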
1791static bool
1792reuse_source(struct sna *sna,
1793	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
1794	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
1795{
1796	uint32_t color;
1797
1798	if (src_x != msk_x || src_y != msk_y)
1799		return false;
1800
1801	if (src == mask) {
1802		DBG(("%s: mask is source\n", __FUNCTION__));
1803		*mc = *sc;
1804		mc->bo = kgem_bo_reference(mc->bo);
1805		return true;
1806	}
1807
1808	if (sna_picture_is_solid(mask, &color))
1809		return gen4_channel_init_solid(sna, mc, color);
1810
1811	if (sc->is_solid)
1812		return false;
1813
1814	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
1815		return false;
1816
1817	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));
1818
1819	if (!sna_transform_equal(src->transform, mask->transform))
1820		return false;
1821
1822	if (!sna_picture_alphamap_equal(src, mask))
1823		return false;
1824
1825	if (!gen5_check_repeat(mask))
1826		return false;
1827
1828	if (!gen5_check_filter(mask))
1829		return false;
1830
1831	if (!gen5_check_format(mask->format))
1832		return false;
1833
1834	DBG(("%s: reusing source channel for mask with a twist\n",
1835	     __FUNCTION__));
1836
1837	*mc = *sc;
1838	mc->repeat = gen5_repeat(mask->repeat ? mask->repeatType : RepeatNone);
1839	mc->filter = gen5_filter(mask->filter);
1840	mc->pict_format = mask->format;
1841	mc->card_format = gen5_get_card_format(mask->format);
1842	mc->bo = kgem_bo_reference(mc->bo);
1843	return true;
1844}
1845
1846static bool
1847gen5_render_composite(struct sna *sna,
1848		      uint8_t op,
1849		      PicturePtr src,
1850		      PicturePtr mask,
1851		      PicturePtr dst,
1852		      int16_t src_x, int16_t src_y,
1853		      int16_t msk_x, int16_t msk_y,
1854		      int16_t dst_x, int16_t dst_y,
1855		      int16_t width, int16_t height,
1856		      struct sna_composite_op *tmp)
1857{
1858	DBG(("%s: %dx%d, current mode=%d\n", __FUNCTION__,
1859	     width, height, sna->kgem.mode));
1860
1861	if (op >= ARRAY_SIZE(gen5_blend_op)) {
1862		DBG(("%s: unhandled blend op %d\n", __FUNCTION__, op));
1863		return false;
1864	}
1865
1866	if (mask == NULL &&
1867	    try_blt(sna, dst, src, width, height) &&
1868	    sna_blt_composite(sna, op,
1869			      src, dst,
1870			      src_x, src_y,
1871			      dst_x, dst_y,
1872			      width, height,
1873			      tmp, false))
1874		return true;
1875
1876	if (gen5_composite_fallback(sna, src, mask, dst))
1877		return false;
1878
1879	if (need_tiling(sna, width, height))
1880		return sna_tiling_composite(op, src, mask, dst,
1881					    src_x, src_y,
1882					    msk_x, msk_y,
1883					    dst_x, dst_y,
1884					    width, height,
1885					    tmp);
1886
1887	if (!gen5_composite_set_target(sna, tmp, dst,
1888				       dst_x, dst_y, width, height,
1889				       op > PictOpSrc || dst->pCompositeClip->data)) {
1890		DBG(("%s: failed to set composite target\n", __FUNCTION__));
1891		return false;
1892	}
1893
1894	DBG(("%s: preparing source\n", __FUNCTION__));
1895	tmp->op = op;
1896	switch (gen5_composite_picture(sna, src, &tmp->src,
1897				       src_x, src_y,
1898				       width, height,
1899				       dst_x, dst_y,
1900				       dst->polyMode == PolyModePrecise)) {
1901	case -1:
1902		DBG(("%s: failed to prepare source picture\n", __FUNCTION__));
1903		goto cleanup_dst;
1904	case 0:
1905		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
1906			goto cleanup_dst;
1907		/* fall through to fixup */
1908	case 1:
1909		if (mask == NULL &&
1910		    sna_blt_composite__convert(sna,
1911					       dst_x, dst_y, width, height,
1912					       tmp))
1913			return true;
1914
1915		gen5_composite_channel_convert(&tmp->src);
1916		break;
1917	}
1918
1919	tmp->is_affine = tmp->src.is_affine;
1920	tmp->has_component_alpha = false;
1921	tmp->need_magic_ca_pass = false;
1922
1923	if (mask) {
1924		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
1925			tmp->has_component_alpha = true;
1926
1927			/* Check if it's component alpha that relies on a source alpha and on
1928			 * the source value.  We can only get one of those into the single
1929			 * source value that we get to blend with.
1930			 */
1931			if (gen5_blend_op[op].src_alpha &&
1932			    (gen5_blend_op[op].src_blend != GEN5_BLENDFACTOR_ZERO)) {
1933				if (op != PictOpOver) {
1934					DBG(("%s: unhandled CA blend op %d\n", __FUNCTION__, op));
1935					goto cleanup_src;
1936				}
1937
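				/* A sketch of the resulting two-pass blend
				 * (the ADD pass is emitted later, when
				 * need_magic_ca_pass is honoured):
				 *   pass 1, PictOpOutReverse:
				 *     dst = (1 - mask*alpha(src)) * dst
				 *   pass 2, PictOpAdd:
				 *     dst += mask * src
				 * which together compute CA OVER.
				 */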
1938				tmp->need_magic_ca_pass = true;
1939				tmp->op = PictOpOutReverse;
1940			}
1941		}
1942
1943		if (!reuse_source(sna,
1944				  src, &tmp->src, src_x, src_y,
1945				  mask, &tmp->mask, msk_x, msk_y)) {
1946			DBG(("%s: preparing mask\n", __FUNCTION__));
1947			switch (gen5_composite_picture(sna, mask, &tmp->mask,
1948						       msk_x, msk_y,
1949						       width, height,
1950						       dst_x, dst_y,
1951						       dst->polyMode == PolyModePrecise)) {
1952			case -1:
1953				DBG(("%s: failed to prepare mask picture\n", __FUNCTION__));
1954				goto cleanup_src;
1955			case 0:
1956				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
1957					goto cleanup_src;
1958				/* fall through to fixup */
1959			case 1:
1960				gen5_composite_channel_convert(&tmp->mask);
1961				break;
1962			}
1963		}
1964
1965		tmp->is_affine &= tmp->mask.is_affine;
1966	}
1967
1968	tmp->u.gen5.wm_kernel =
1969		gen5_choose_composite_kernel(tmp->op,
1970					     tmp->mask.bo != NULL,
1971					     tmp->has_component_alpha,
1972					     tmp->is_affine);
1973	tmp->u.gen5.ve_id = gen4_choose_composite_emitter(sna, tmp);
1974
1975	tmp->blt   = gen5_render_composite_blt;
1976	tmp->box   = gen5_render_composite_box;
1977	tmp->boxes = gen5_render_composite_boxes__blt;
1978	if (tmp->emit_boxes) {
1979		tmp->boxes = gen5_render_composite_boxes;
1980		tmp->thread_boxes = gen5_render_composite_boxes__thread;
1981	}
1982	tmp->done  = gen5_render_composite_done;
1983
1984	if (!kgem_check_bo(&sna->kgem,
1985			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL)) {
1986		kgem_submit(&sna->kgem);
1987		if (!kgem_check_bo(&sna->kgem,
1988				   tmp->dst.bo, tmp->src.bo, tmp->mask.bo, NULL))
1989			goto cleanup_mask;
1990	}
1991
1992	gen5_bind_surfaces(sna, tmp);
1993	gen5_align_vertex(sna, tmp);
1994	return true;
1995
1996cleanup_mask:
1997	if (tmp->mask.bo)
1998		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
1999cleanup_src:
2000	if (tmp->src.bo)
2001		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
2002cleanup_dst:
2003	if (tmp->redirect.real_bo)
2004		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
2005	return false;
2006}
2007
2008#if !NO_COMPOSITE_SPANS
2009fastcall static void
2010gen5_render_composite_spans_box(struct sna *sna,
2011				const struct sna_composite_spans_op *op,
2012				const BoxRec *box, float opacity)
2013{
2014	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
2015	     __FUNCTION__,
2016	     op->base.src.offset[0], op->base.src.offset[1],
2017	     opacity,
2018	     op->base.dst.x, op->base.dst.y,
2019	     box->x1, box->y1,
2020	     box->x2 - box->x1,
2021	     box->y2 - box->y1));
2022
2023	gen5_get_rectangles(sna, &op->base, 1, gen5_bind_surfaces);
2024	op->prim_emit(sna, op, box, opacity);
2025}
2026
2027static void
2028gen5_render_composite_spans_boxes(struct sna *sna,
2029				  const struct sna_composite_spans_op *op,
2030				  const BoxRec *box, int nbox,
2031				  float opacity)
2032{
2033	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
2034	     __FUNCTION__, nbox,
2035	     op->base.src.offset[0], op->base.src.offset[1],
2036	     opacity,
2037	     op->base.dst.x, op->base.dst.y));
2038
2039	do {
2040		int nbox_this_time;
2041
2042		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
2043						     gen5_bind_surfaces);
2044		nbox -= nbox_this_time;
2045
2046		do {
2047			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2048			     box->x1, box->y1,
2049			     box->x2 - box->x1,
2050			     box->y2 - box->y1));
2051
2052			op->prim_emit(sna, op, box++, opacity);
2053		} while (--nbox_this_time);
2054	} while (nbox);
2055}
2056
2057fastcall static void
2058gen5_render_composite_spans_boxes__thread(struct sna *sna,
2059					  const struct sna_composite_spans_op *op,
2060					  const struct sna_opacity_box *box,
2061					  int nbox)
2062{
2063	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
2064	     __FUNCTION__, nbox,
2065	     op->base.src.offset[0], op->base.src.offset[1],
2066	     op->base.dst.x, op->base.dst.y));
2067
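	/* Reserve vertex space under the lock, mark it in use (acquire),
	 * then release the lock so other threads can make progress while
	 * this one writes its rectangles into the reserved range.
	 */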
2068	sna_vertex_lock(&sna->render);
2069	do {
2070		int nbox_this_time;
2071		float *v;
2072
2073		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
2074						     gen5_bind_surfaces);
2075		assert(nbox_this_time);
2076		nbox -= nbox_this_time;
2077
2078		v = sna->render.vertices + sna->render.vertex_used;
2079		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
2080
2081		sna_vertex_acquire__locked(&sna->render);
2082		sna_vertex_unlock(&sna->render);
2083
2084		op->emit_boxes(op, box, nbox_this_time, v);
2085		box += nbox_this_time;
2086
2087		sna_vertex_lock(&sna->render);
2088		sna_vertex_release__locked(&sna->render);
2089	} while (nbox);
2090	sna_vertex_unlock(&sna->render);
2091}
2092
2093fastcall static void
2094gen5_render_composite_spans_done(struct sna *sna,
2095				 const struct sna_composite_spans_op *op)
2096{
2097	if (sna->render.vertex_offset)
2098		gen4_vertex_flush(sna);
2099
2100	DBG(("%s()\n", __FUNCTION__));
2101
2102	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2103	sna_render_composite_redirect_done(sna, &op->base);
2104}
2105
2106static bool
2107gen5_check_composite_spans(struct sna *sna,
2108			   uint8_t op, PicturePtr src, PicturePtr dst,
2109			   int16_t width, int16_t height,
2110			   unsigned flags)
2111{
2112	DBG(("%s: op=%d, width=%d, height=%d, flags=%x\n",
2113	     __FUNCTION__, op, width, height, flags));
2114
2115	if (op >= ARRAY_SIZE(gen5_blend_op))
2116		return false;
2117
2118	if (gen5_composite_fallback(sna, src, NULL, dst)) {
2119		DBG(("%s: operation would fall back\n", __FUNCTION__));
2120		return false;
2121	}
2122
2123	if (need_tiling(sna, width, height) &&
2124	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
2125		DBG(("%s: fallback, tiled operation not on GPU\n",
2126		     __FUNCTION__));
2127		return false;
2128	}
2129
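	/* For non-rectilinear spans, prefer the GPU only when the target
	 * is already busy on either bo; an inplace hint, or an a8
	 * destination without PREFER_GPU_SPANS, is handled on the CPU.
	 */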
2130	if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0) {
2131		struct sna_pixmap *priv = sna_pixmap_from_drawable(dst->pDrawable);
2132		assert(priv);
2133
2134		if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
2135			return true;
2136
2137		if (flags & COMPOSITE_SPANS_INPLACE_HINT)
2138			return false;
2139
2140		if ((sna->render.prefer_gpu & PREFER_GPU_SPANS) == 0 &&
2141		    dst->format == PICT_a8)
2142			return false;
2143
2144		return priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo);
2145	}
2146
2147	return true;
2148}
2149
2150static bool
2151gen5_render_composite_spans(struct sna *sna,
2152			    uint8_t op,
2153			    PicturePtr src,
2154			    PicturePtr dst,
2155			    int16_t src_x,  int16_t src_y,
2156			    int16_t dst_x,  int16_t dst_y,
2157			    int16_t width,  int16_t height,
2158			    unsigned flags,
2159			    struct sna_composite_spans_op *tmp)
2160{
2161	DBG(("%s: %dx%d with flags=%x, current ring=%d\n", __FUNCTION__,
2162	     width, height, flags, sna->kgem.ring));
2163
2164	assert(gen5_check_composite_spans(sna, op, src, dst, width, height, flags));
2165
2166	if (need_tiling(sna, width, height)) {
2167		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
2168		     __FUNCTION__, width, height));
2169		return sna_tiling_composite_spans(op, src, dst,
2170						  src_x, src_y, dst_x, dst_y,
2171						  width, height, flags, tmp);
2172	}
2173
2174	tmp->base.op = op;
2175	if (!gen5_composite_set_target(sna, &tmp->base, dst,
2176				       dst_x, dst_y, width, height,
2177				       true))
2178		return false;
2179
2180	switch (gen5_composite_picture(sna, src, &tmp->base.src,
2181				       src_x, src_y,
2182				       width, height,
2183				       dst_x, dst_y,
2184				       dst->polyMode == PolyModePrecise)) {
2185	case -1:
2186		goto cleanup_dst;
2187	case 0:
2188		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
2189			goto cleanup_dst;
2190		/* fall through to fixup */
2191	case 1:
2192		gen5_composite_channel_convert(&tmp->base.src);
2193		break;
2194	}
2195
2196	tmp->base.mask.bo = NULL;
2197
2198	tmp->base.is_affine = tmp->base.src.is_affine;
2199	tmp->base.has_component_alpha = false;
2200	tmp->base.need_magic_ca_pass = false;
2201
2202	tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(sna, tmp);
2203	tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
2204
2205	tmp->box   = gen5_render_composite_spans_box;
2206	tmp->boxes = gen5_render_composite_spans_boxes;
2207	if (tmp->emit_boxes)
2208		tmp->thread_boxes = gen5_render_composite_spans_boxes__thread;
2209	tmp->done  = gen5_render_composite_spans_done;
2210
2211	if (!kgem_check_bo(&sna->kgem,
2212			   tmp->base.dst.bo, tmp->base.src.bo,
2213			   NULL))  {
2214		kgem_submit(&sna->kgem);
2215		if (!kgem_check_bo(&sna->kgem,
2216				   tmp->base.dst.bo, tmp->base.src.bo,
2217				   NULL))
2218			goto cleanup_src;
2219	}
2220
2221	gen5_bind_surfaces(sna, &tmp->base);
2222	gen5_align_vertex(sna, &tmp->base);
2223	return true;
2224
2225cleanup_src:
2226	if (tmp->base.src.bo)
2227		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
2228cleanup_dst:
2229	if (tmp->base.redirect.real_bo)
2230		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
2231	return false;
2232}
2233#endif
2234
2235static void
2236gen5_copy_bind_surfaces(struct sna *sna,
2237			const struct sna_composite_op *op)
2238{
2239	bool dirty = kgem_bo_is_dirty(op->dst.bo);
2240	uint32_t *binding_table;
2241	uint16_t offset;
2242
2243	gen5_get_batch(sna, op);
2244
2245	binding_table = gen5_composite_get_binding_table(sna, &offset);
2246
2247	binding_table[0] =
2248		gen5_bind_bo(sna,
2249			     op->dst.bo, op->dst.width, op->dst.height,
2250			     gen5_get_dest_format(op->dst.format),
2251			     true);
2252	binding_table[1] =
2253		gen5_bind_bo(sna,
2254			     op->src.bo, op->src.width, op->src.height,
2255			     op->src.card_format,
2256			     false);
2257
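	/* If the freshly written binding table matches the previous one,
	 * rewind the surface allocation and reuse the cached table.
	 */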
2258	if (sna->kgem.surface == offset &&
2259	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen5.surface_table) == *(uint64_t*)binding_table) {
2260		sna->kgem.surface += sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);
2261		offset = sna->render_state.gen5.surface_table;
2262	}
2263
2264	gen5_emit_state(sna, op, offset | dirty);
2265}
2266
2267static bool
2268gen5_render_copy_boxes(struct sna *sna, uint8_t alu,
2269		       PixmapPtr src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
2270		       PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
2271		       const BoxRec *box, int n, unsigned flags)
2272{
2273	struct sna_composite_op tmp;
2274
2275	DBG(("%s alu=%d, src=%ld:handle=%d, dst=%ld:handle=%d boxes=%d x [((%d, %d), (%d, %d))...], flags=%x\n",
2276	     __FUNCTION__, alu,
2277	     src->drawable.serialNumber, src_bo->handle,
2278	     dst->drawable.serialNumber, dst_bo->handle,
2279	     n, box->x1, box->y1, box->x2, box->y2,
2280	     flags));
2281
2282	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2283	    sna_blt_copy_boxes(sna, alu,
2284			       src_bo, src_dx, src_dy,
2285			       dst_bo, dst_dx, dst_dy,
2286			       dst->drawable.bitsPerPixel,
2287			       box, n))
2288		return true;
2289
2290	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo) {
2291fallback_blt:
2292		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
2293			return false;
2294
2295		return sna_blt_copy_boxes_fallback(sna, alu,
2296						   src, src_bo, src_dx, src_dy,
2297						   dst, dst_bo, dst_dx, dst_dy,
2298						   box, n);
2299	}
2300
2301	memset(&tmp, 0, sizeof(tmp));
2302
2303	if (dst->drawable.depth == src->drawable.depth) {
2304		tmp.dst.format = sna_render_format_for_depth(dst->drawable.depth);
2305		tmp.src.pict_format = tmp.dst.format;
2306	} else {
2307		tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
2308		tmp.src.pict_format = sna_format_for_depth(src->drawable.depth);
2309	}
2310	if (!gen5_check_format(tmp.src.pict_format)) {
2311		DBG(("%s: unsupported source format, %x, use BLT\n",
2312		     __FUNCTION__, tmp.src.pict_format));
2313		goto fallback_blt;
2314	}
2315
2316	DBG(("%s (%d, %d)->(%d, %d) x %d\n",
2317	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n));
2318
2319	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
2320
2321	tmp.dst.pixmap = dst;
2322	tmp.dst.width  = dst->drawable.width;
2323	tmp.dst.height = dst->drawable.height;
2324	tmp.dst.x = tmp.dst.y = 0;
2325	tmp.dst.bo = dst_bo;
2326	tmp.damage = NULL;
2327
2328	sna_render_composite_redirect_init(&tmp);
2329	if (too_large(tmp.dst.width, tmp.dst.height)) {
2330		BoxRec extents = box[0];
2331		int i;
2332
2333		for (i = 1; i < n; i++) {
2334			if (box[i].x1 < extents.x1)
2335				extents.x1 = box[i].x1;
2336			if (box[i].y1 < extents.y1)
2337				extents.y1 = box[i].y1;
2338
2339			if (box[i].x2 > extents.x2)
2340				extents.x2 = box[i].x2;
2341			if (box[i].y2 > extents.y2)
2342				extents.y2 = box[i].y2;
2343		}
2344		if (!sna_render_composite_redirect(sna, &tmp,
2345						   extents.x1 + dst_dx,
2346						   extents.y1 + dst_dy,
2347						   extents.x2 - extents.x1,
2348						   extents.y2 - extents.y1,
2349						   n > 1))
2350			goto fallback_tiled;
2351	}
2352
2353	tmp.src.filter = SAMPLER_FILTER_NEAREST;
2354	tmp.src.repeat = SAMPLER_EXTEND_NONE;
2355	tmp.src.card_format = gen5_get_card_format(tmp.src.pict_format);
2356	if (too_large(src->drawable.width, src->drawable.height)) {
2357		BoxRec extents = box[0];
2358		int i;
2359
2360		for (i = 1; i < n; i++) {
2361			if (box[i].x1 < extents.x1)
2362				extents.x1 = box[i].x1;
2363			if (box[i].y1 < extents.y1)
2364				extents.y1 = box[i].y1;
2365
2366			if (box[i].x2 > extents.x2)
2367				extents.x2 = box[i].x2;
2368			if (box[i].y2 > extents.y2)
2369				extents.y2 = box[i].y2;
2370		}
2371
2372		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
2373					       extents.x1 + src_dx,
2374					       extents.y1 + src_dy,
2375					       extents.x2 - extents.x1,
2376					       extents.y2 - extents.y1))
2377			goto fallback_tiled_dst;
2378	} else {
2379		tmp.src.bo = kgem_bo_reference(src_bo);
2380		tmp.src.width  = src->drawable.width;
2381		tmp.src.height = src->drawable.height;
2382		tmp.src.offset[0] = tmp.src.offset[1] = 0;
2383		tmp.src.scale[0] = 1.f/src->drawable.width;
2384		tmp.src.scale[1] = 1.f/src->drawable.height;
2385	}
2386
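	/* Each vertex is one packed (x, y) dword plus two float texcoords,
	 * i.e. 3 float slots; a rectangle is 3 vertices, hence 9 floats.
	 */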
2387	tmp.is_affine = true;
2388	tmp.floats_per_vertex = 3;
2389	tmp.floats_per_rect = 9;
2390	tmp.u.gen5.wm_kernel = WM_KERNEL;
2391	tmp.u.gen5.ve_id = 2;
2392
2393	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2394		kgem_submit(&sna->kgem);
2395		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2396			DBG(("%s: aperture check failed\n", __FUNCTION__));
2397			goto fallback_tiled_src;
2398		}
2399	}
2400
2401	dst_dx += tmp.dst.x;
2402	dst_dy += tmp.dst.y;
2403	tmp.dst.x = tmp.dst.y = 0;
2404
2405	src_dx += tmp.src.offset[0];
2406	src_dy += tmp.src.offset[1];
2407
2408	gen5_copy_bind_surfaces(sna, &tmp);
2409	gen5_align_vertex(sna, &tmp);
2410
2411	do {
2412		int n_this_time;
2413
2414		n_this_time = gen5_get_rectangles(sna, &tmp, n,
2415						  gen5_copy_bind_surfaces);
2416		n -= n_this_time;
2417
2418		do {
2419			DBG(("	(%d, %d) -> (%d, %d) + (%d, %d)\n",
2420			     box->x1 + src_dx, box->y1 + src_dy,
2421			     box->x1 + dst_dx, box->y1 + dst_dy,
2422			     box->x2 - box->x1, box->y2 - box->y1));
2423			OUT_VERTEX(box->x2 + dst_dx, box->y2 + dst_dy);
2424			OUT_VERTEX_F((box->x2 + src_dx) * tmp.src.scale[0]);
2425			OUT_VERTEX_F((box->y2 + src_dy) * tmp.src.scale[1]);
2426
2427			OUT_VERTEX(box->x1 + dst_dx, box->y2 + dst_dy);
2428			OUT_VERTEX_F((box->x1 + src_dx) * tmp.src.scale[0]);
2429			OUT_VERTEX_F((box->y2 + src_dy) * tmp.src.scale[1]);
2430
2431			OUT_VERTEX(box->x1 + dst_dx, box->y1 + dst_dy);
2432			OUT_VERTEX_F((box->x1 + src_dx) * tmp.src.scale[0]);
2433			OUT_VERTEX_F((box->y1 + src_dy) * tmp.src.scale[1]);
2434
2435			box++;
2436		} while (--n_this_time);
2437	} while (n);
2438
2439	gen4_vertex_flush(sna);
2440	sna_render_composite_redirect_done(sna, &tmp);
2441	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2442	return true;
2443
2444fallback_tiled_src:
2445	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2446fallback_tiled_dst:
2447	if (tmp.redirect.real_bo)
2448		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2449fallback_tiled:
2450	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2451	    sna_blt_copy_boxes(sna, alu,
2452			       src_bo, src_dx, src_dy,
2453			       dst_bo, dst_dx, dst_dy,
2454			       dst->drawable.bitsPerPixel,
2455			       box, n))
2456		return true;
2457
2458	DBG(("%s: tiled fallback\n", __FUNCTION__));
2459	return sna_tiling_copy_boxes(sna, alu,
2460				     src, src_bo, src_dx, src_dy,
2461				     dst, dst_bo, dst_dx, dst_dy,
2462				     box, n);
2463}
2464
2465static void
2466gen5_render_copy_blt(struct sna *sna,
2467		     const struct sna_copy_op *op,
2468		     int16_t sx, int16_t sy,
2469		     int16_t w,  int16_t h,
2470		     int16_t dx, int16_t dy)
2471{
2472	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n", __FUNCTION__,
2473	     sx, sy, dx, dy, w, h));
2474
2475	gen5_get_rectangles(sna, &op->base, 1, gen5_copy_bind_surfaces);
2476
2477	OUT_VERTEX(dx+w, dy+h);
2478	OUT_VERTEX_F((sx+w)*op->base.src.scale[0]);
2479	OUT_VERTEX_F((sy+h)*op->base.src.scale[1]);
2480
2481	OUT_VERTEX(dx, dy+h);
2482	OUT_VERTEX_F(sx*op->base.src.scale[0]);
2483	OUT_VERTEX_F((sy+h)*op->base.src.scale[1]);
2484
2485	OUT_VERTEX(dx, dy);
2486	OUT_VERTEX_F(sx*op->base.src.scale[0]);
2487	OUT_VERTEX_F(sy*op->base.src.scale[1]);
2488}
2489
2490static void
2491gen5_render_copy_done(struct sna *sna,
2492		      const struct sna_copy_op *op)
2493{
2494	if (sna->render.vertex_offset)
2495		gen4_vertex_flush(sna);
2496
2497	DBG(("%s()\n", __FUNCTION__));
2498}
2499
2500static bool
2501gen5_render_copy(struct sna *sna, uint8_t alu,
2502		 PixmapPtr src, struct kgem_bo *src_bo,
2503		 PixmapPtr dst, struct kgem_bo *dst_bo,
2504		 struct sna_copy_op *op)
2505{
2506	DBG(("%s (alu=%d)\n", __FUNCTION__, alu));
2507
2508	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2509	    sna_blt_copy(sna, alu,
2510			 src_bo, dst_bo,
2511			 dst->drawable.bitsPerPixel,
2512			 op))
2513		return true;
2514
2515	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
2516	    too_large(src->drawable.width, src->drawable.height) ||
2517	    too_large(dst->drawable.width, dst->drawable.height)) {
2518fallback:
2519		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
2520			return false;
2521
2522		return sna_blt_copy(sna, alu, src_bo, dst_bo,
2523				    dst->drawable.bitsPerPixel,
2524				    op);
2525	}
2526
2527	if (dst->drawable.depth == src->drawable.depth) {
2528		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
2529		op->base.src.pict_format = op->base.dst.format;
2530	} else {
2531		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
2532		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
2533	}
2534	if (!gen5_check_format(op->base.src.pict_format))
2535		goto fallback;
2536
2537	op->base.op = alu == GXcopy ? PictOpSrc : PictOpClear;
2538
2539	op->base.dst.pixmap = dst;
2540	op->base.dst.width  = dst->drawable.width;
2541	op->base.dst.height = dst->drawable.height;
2542	op->base.dst.bo = dst_bo;
2543
2544	op->base.src.bo = src_bo;
2545	op->base.src.card_format =
2546		gen5_get_card_format(op->base.src.pict_format);
2547	op->base.src.width  = src->drawable.width;
2548	op->base.src.height = src->drawable.height;
2549	op->base.src.scale[0] = 1.f/src->drawable.width;
2550	op->base.src.scale[1] = 1.f/src->drawable.height;
2551	op->base.src.filter = SAMPLER_FILTER_NEAREST;
2552	op->base.src.repeat = SAMPLER_EXTEND_NONE;
2553
2554	op->base.is_affine = true;
2555	op->base.floats_per_vertex = 3;
2556	op->base.floats_per_rect = 9;
2557	op->base.u.gen5.wm_kernel = WM_KERNEL;
2558	op->base.u.gen5.ve_id = 2;
2559
2560	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2561		kgem_submit(&sna->kgem);
2562		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
2563			goto fallback;
2564	}
2565
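	/* The source has unflushed writes: try the BLT first, presumably
	 * to avoid stalling the sampler behind a flush.
	 */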
2566	if (kgem_bo_is_dirty(src_bo)) {
2567		if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2568		    sna_blt_copy(sna, alu,
2569				 src_bo, dst_bo,
2570				 dst->drawable.bitsPerPixel,
2571				 op))
2572			return true;
2573	}
2574
2575	gen5_copy_bind_surfaces(sna, &op->base);
2576	gen5_align_vertex(sna, &op->base);
2577
2578	op->blt  = gen5_render_copy_blt;
2579	op->done = gen5_render_copy_done;
2580	return true;
2581}
2582
2583static void
2584gen5_fill_bind_surfaces(struct sna *sna,
2585			const struct sna_composite_op *op)
2586{
2587	bool dirty = kgem_bo_is_dirty(op->dst.bo);
2588	uint32_t *binding_table;
2589	uint16_t offset;
2590
2591	gen5_get_batch(sna, op);
2592
2593	binding_table = gen5_composite_get_binding_table(sna, &offset);
2594
2595	binding_table[0] =
2596		gen5_bind_bo(sna,
2597			     op->dst.bo, op->dst.width, op->dst.height,
2598			     gen5_get_dest_format(op->dst.format),
2599			     true);
2600	binding_table[1] =
2601		gen5_bind_bo(sna,
2602			     op->src.bo, 1, 1,
2603			     GEN5_SURFACEFORMAT_B8G8R8A8_UNORM,
2604			     false);
2605
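	/* As in gen5_copy_bind_surfaces: reuse the previous surface table
	 * if nothing has changed.
	 */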
2606	if (sna->kgem.surface == offset &&
2607	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen5.surface_table) == *(uint64_t*)binding_table) {
2608		sna->kgem.surface +=
2609			sizeof(struct gen5_surface_state_padded)/sizeof(uint32_t);
2610		offset = sna->render_state.gen5.surface_table;
2611	}
2612
2613	gen5_emit_state(sna, op, offset | dirty);
2614}
2615
2616static inline bool prefer_blt_fill(struct sna *sna)
2617{
2618#if PREFER_BLT_FILL
2619	return true;
2620#else
2621	return sna->kgem.mode != KGEM_RENDER;
2622#endif
2623}
2624
2625static bool
2626gen5_render_fill_boxes(struct sna *sna,
2627		       CARD8 op,
2628		       PictFormat format,
2629		       const xRenderColor *color,
2630		       PixmapPtr dst, struct kgem_bo *dst_bo,
2631		       const BoxRec *box, int n)
2632{
2633	struct sna_composite_op tmp;
2634	uint32_t pixel;
2635
2636	DBG(("%s op=%x, color=(%04x,%04x,%04x,%04x), boxes=%d x [((%d, %d), (%d, %d))...]\n",
2637	     __FUNCTION__, op,
2638	     color->red, color->green, color->blue, color->alpha,
2639	     n, box->x1, box->y1, box->x2, box->y2));
2640
2641	if (op >= ARRAY_SIZE(gen5_blend_op)) {
2642		DBG(("%s: fallback due to unhandled blend op: %d\n",
2643		     __FUNCTION__, op));
2644		return false;
2645	}
2646
2647	if (op <= PictOpSrc &&
2648	    (prefer_blt_fill(sna) ||
2649	     too_large(dst->drawable.width, dst->drawable.height) ||
2650	     !gen5_check_dst_format(format))) {
2651		uint8_t alu = GXinvalid;
2652
2653		pixel = 0;
2654		if (op == PictOpClear)
2655			alu = GXclear;
2656		else if (sna_get_pixel_from_rgba(&pixel,
2657						 color->red,
2658						 color->green,
2659						 color->blue,
2660						 color->alpha,
2661						 format))
2662			alu = GXcopy;
2663
2664		if (alu != GXinvalid &&
2665		    sna_blt_fill_boxes(sna, alu,
2666				       dst_bo, dst->drawable.bitsPerPixel,
2667				       pixel, box, n))
2668			return true;
2669
2670		if (!gen5_check_dst_format(format))
2671			return false;
2672
2673		if (too_large(dst->drawable.width, dst->drawable.height))
2674			return sna_tiling_fill_boxes(sna, op, format, color,
2675						     dst, dst_bo, box, n);
2676	}
2677
2678	if (op == PictOpClear) {
2679		pixel = 0;
2680		op = PictOpSrc;
2681	} else if (!sna_get_pixel_from_rgba(&pixel,
2682					    color->red,
2683					    color->green,
2684					    color->blue,
2685					    color->alpha,
2686					    PICT_a8r8g8b8))
2687		return false;
2688
2689	DBG(("%s(%08x x %d)\n", __FUNCTION__, pixel, n));
2690
2691	memset(&tmp, 0, sizeof(tmp));
2692
2693	tmp.op = op;
2694
2695	tmp.dst.pixmap = dst;
2696	tmp.dst.width  = dst->drawable.width;
2697	tmp.dst.height = dst->drawable.height;
2698	tmp.dst.format = format;
2699	tmp.dst.bo = dst_bo;
2700
2701	tmp.src.bo = sna_render_get_solid(sna, pixel);
2702	tmp.src.filter = SAMPLER_FILTER_NEAREST;
2703	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
2704
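	/* Solid fills sample a 1x1 repeating solid bo, so each vertex is
	 * just the packed (x, y) dword plus a constant 0.5 texcoord:
	 * 2 floats per vertex, 6 per rectangle.
	 */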
2705	tmp.is_affine = true;
2706	tmp.floats_per_vertex = 2;
2707	tmp.floats_per_rect = 6;
2708	tmp.u.gen5.wm_kernel = WM_KERNEL;
2709	tmp.u.gen5.ve_id = 1;
2710
2711	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2712		kgem_submit(&sna->kgem);
2713		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
2714	}
2715
2716	gen5_fill_bind_surfaces(sna, &tmp);
2717	gen5_align_vertex(sna, &tmp);
2718
2719	do {
2720		int n_this_time;
2721
2722		n_this_time = gen5_get_rectangles(sna, &tmp, n,
2723						  gen5_fill_bind_surfaces);
2724		n -= n_this_time;
2725
2726		do {
2727			DBG(("	(%d, %d), (%d, %d)\n",
2728			     box->x1, box->y1, box->x2, box->y2));
2729			OUT_VERTEX(box->x2, box->y2);
2730			OUT_VERTEX_F(.5);
2731
2732			OUT_VERTEX(box->x1, box->y2);
2733			OUT_VERTEX_F(.5);
2734
2735			OUT_VERTEX(box->x1, box->y1);
2736			OUT_VERTEX_F(.5);
2737
2738			box++;
2739		} while (--n_this_time);
2740	} while (n);
2741
2742	gen4_vertex_flush(sna);
2743	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2744	return true;
2745}
2746
2747static void
2748gen5_render_fill_op_blt(struct sna *sna,
2749			const struct sna_fill_op *op,
2750			int16_t x, int16_t y, int16_t w, int16_t h)
2751{
2752	DBG(("%s (%d, %d)x(%d, %d)\n", __FUNCTION__, x,y,w,h));
2753
2754	gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
2755
2756	OUT_VERTEX(x+w, y+h);
2757	OUT_VERTEX_F(.5);
2758
2759	OUT_VERTEX(x, y+h);
2760	OUT_VERTEX_F(.5);
2761
2762	OUT_VERTEX(x, y);
2763	OUT_VERTEX_F(.5);
2764}
2765
2766fastcall static void
2767gen5_render_fill_op_box(struct sna *sna,
2768			const struct sna_fill_op *op,
2769			const BoxRec *box)
2770{
2771	DBG(("%s: (%d, %d),(%d, %d)\n", __FUNCTION__,
2772	     box->x1, box->y1, box->x2, box->y2));
2773
2774	gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
2775
2776	OUT_VERTEX(box->x2, box->y2);
2777	OUT_VERTEX_F(.5);
2778
2779	OUT_VERTEX(box->x1, box->y2);
2780	OUT_VERTEX_F(.5);
2781
2782	OUT_VERTEX(box->x1, box->y1);
2783	OUT_VERTEX_F(.5);
2784}
2785
2786fastcall static void
2787gen5_render_fill_op_boxes(struct sna *sna,
2788			  const struct sna_fill_op *op,
2789			  const BoxRec *box,
2790			  int nbox)
2791{
2792	DBG(("%s: (%d, %d),(%d, %d)... x %d\n", __FUNCTION__,
2793	     box->x1, box->y1, box->x2, box->y2, nbox));
2794
2795	do {
2796		int nbox_this_time;
2797
2798		nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
2799						     gen5_fill_bind_surfaces);
2800		nbox -= nbox_this_time;
2801
2802		do {
2803			OUT_VERTEX(box->x2, box->y2);
2804			OUT_VERTEX_F(.5);
2805
2806			OUT_VERTEX(box->x1, box->y2);
2807			OUT_VERTEX_F(.5);
2808
2809			OUT_VERTEX(box->x1, box->y1);
2810			OUT_VERTEX_F(.5);
2811			box++;
2812		} while (--nbox_this_time);
2813	} while (nbox);
2814}
2815
2816static void
2817gen5_render_fill_op_done(struct sna *sna,
2818			 const struct sna_fill_op *op)
2819{
2820	if (sna->render.vertex_offset)
2821		gen4_vertex_flush(sna);
2822	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2823
2824	DBG(("%s()\n", __FUNCTION__));
2825}
2826
2827static bool
2828gen5_render_fill(struct sna *sna, uint8_t alu,
2829		 PixmapPtr dst, struct kgem_bo *dst_bo,
2830		 uint32_t color,
2831		 struct sna_fill_op *op)
2832{
2833	DBG(("%s(alu=%d, color=%08x)\n", __FUNCTION__, alu, color));
2834
2835	if (prefer_blt_fill(sna) &&
2836	    sna_blt_fill(sna, alu,
2837			 dst_bo, dst->drawable.bitsPerPixel,
2838			 color,
2839			 op))
2840		return true;
2841
2842	if (!(alu == GXcopy || alu == GXclear) ||
2843	    too_large(dst->drawable.width, dst->drawable.height))
2844		return sna_blt_fill(sna, alu,
2845				    dst_bo, dst->drawable.bitsPerPixel,
2846				    color,
2847				    op);
2848
2849	if (alu == GXclear)
2850		color = 0;
2851
2852	op->base.op = color == 0 ? PictOpClear : PictOpSrc;
2853
2854	op->base.dst.pixmap = dst;
2855	op->base.dst.width  = dst->drawable.width;
2856	op->base.dst.height = dst->drawable.height;
2857	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
2858	op->base.dst.bo = dst_bo;
2859	op->base.dst.x = op->base.dst.y = 0;
2860
2861	op->base.need_magic_ca_pass = 0;
2862	op->base.has_component_alpha = 0;
2863
2864	op->base.src.bo =
2865		sna_render_get_solid(sna,
2866				     sna_rgba_for_color(color,
2867							dst->drawable.depth));
2868	op->base.src.filter = SAMPLER_FILTER_NEAREST;
2869	op->base.src.repeat = SAMPLER_EXTEND_REPEAT;
2870
2871	op->base.mask.bo = NULL;
2872	op->base.mask.filter = SAMPLER_FILTER_NEAREST;
2873	op->base.mask.repeat = SAMPLER_EXTEND_NONE;
2874
2875	op->base.is_affine = true;
2876	op->base.floats_per_vertex = 2;
2877	op->base.floats_per_rect = 6;
2878	op->base.u.gen5.wm_kernel = WM_KERNEL;
2879	op->base.u.gen5.ve_id = 1;
2880
2881	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2882		kgem_submit(&sna->kgem);
2883		assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
2884	}
2885
2886	gen5_fill_bind_surfaces(sna, &op->base);
2887	gen5_align_vertex(sna, &op->base);
2888
2889	op->blt   = gen5_render_fill_op_blt;
2890	op->box   = gen5_render_fill_op_box;
2891	op->boxes = gen5_render_fill_op_boxes;
2892	op->done  = gen5_render_fill_op_done;
2893	return true;
2894}
2895
2896static bool
2897gen5_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
2898			     uint32_t color,
2899			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
2900			     uint8_t alu)
2901{
2902	BoxRec box;
2903
2904	box.x1 = x1;
2905	box.y1 = y1;
2906	box.x2 = x2;
2907	box.y2 = y2;
2908
2909	return sna_blt_fill_boxes(sna, alu,
2910				  bo, dst->drawable.bitsPerPixel,
2911				  color, &box, 1);
2912}
2913
2914static bool
2915gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
2916		     uint32_t color,
2917		     int16_t x1, int16_t y1,
2918		     int16_t x2, int16_t y2,
2919		     uint8_t alu)
2920{
2921	struct sna_composite_op tmp;
2922
2923#if NO_FILL_ONE
2924	return gen5_render_fill_one_try_blt(sna, dst, bo, color,
2925					    x1, y1, x2, y2, alu);
2926#endif
2927
2928	/* Prefer to use the BLT if already engaged */
2929	if (prefer_blt_fill(sna) &&
2930	    gen5_render_fill_one_try_blt(sna, dst, bo, color,
2931					 x1, y1, x2, y2, alu))
2932		return true;
2933
2934	/* Must use the BLT if we can't RENDER... */
2935	if (!(alu == GXcopy || alu == GXclear) ||
2936	    too_large(dst->drawable.width, dst->drawable.height))
2937		return gen5_render_fill_one_try_blt(sna, dst, bo, color,
2938						    x1, y1, x2, y2, alu);
2939
2940	if (alu == GXclear)
2941		color = 0;
2942
2943	tmp.op = color == 0 ? PictOpClear : PictOpSrc;
2944
2945	tmp.dst.pixmap = dst;
2946	tmp.dst.width  = dst->drawable.width;
2947	tmp.dst.height = dst->drawable.height;
2948	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
2949	tmp.dst.bo = bo;
2950	tmp.dst.x = tmp.dst.y = 0;
2951
2952	tmp.src.bo =
2953		sna_render_get_solid(sna,
2954				     sna_rgba_for_color(color,
2955							dst->drawable.depth));
2956	tmp.src.filter = SAMPLER_FILTER_NEAREST;
2957	tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
2958
2959	tmp.mask.bo = NULL;
2960	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
2961	tmp.mask.repeat = SAMPLER_EXTEND_NONE;
2962
2963	tmp.is_affine = true;
2964	tmp.floats_per_vertex = 2;
2965	tmp.floats_per_rect = 6;
2966	tmp.has_component_alpha = 0;
2967	tmp.need_magic_ca_pass = false;
2968
2969	tmp.u.gen5.wm_kernel = WM_KERNEL;
2970	tmp.u.gen5.ve_id = 1;
2971
2972	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
2973		kgem_submit(&sna->kgem);
2974		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
2975			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2976			return false;
2977		}
2979	}
2980
2981	gen5_fill_bind_surfaces(sna, &tmp);
2982	gen5_align_vertex(sna, &tmp);
2983
2984	gen5_get_rectangles(sna, &tmp, 1, gen5_fill_bind_surfaces);
2985
2986	DBG(("	(%d, %d), (%d, %d)\n", x1, y1, x2, y2));
2987	OUT_VERTEX(x2, y2);
2988	OUT_VERTEX_F(.5);
2989
2990	OUT_VERTEX(x1, y2);
2991	OUT_VERTEX_F(.5);
2992
2993	OUT_VERTEX(x1, y1);
2994	OUT_VERTEX_F(.5);
2995
2996	gen4_vertex_flush(sna);
2997	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2998
2999	return true;
3000}
3001
3002static void
3003gen5_render_flush(struct sna *sna)
3004{
3005	gen4_vertex_close(sna);
3006
3007	assert(sna->render.vb_id == 0);
3008	assert(sna->render.vertex_offset == 0);
3009}
3010
3011static void
3012gen5_render_context_switch(struct kgem *kgem,
3013			   int new_mode)
3014{
3015	if (!kgem->nbatch)
3016		return;
3017
3018	/* WaNonPipelinedStateCommandFlush
3019	 *
3020	 * Ironlake has a limitation that a 3D or Media command can't
3021	 * be the first command after a BLT, unless it's
3022	 * non-pipelined.
3023	 *
3024	 * We work around this by ensuring that the non-pipelined drawrect
3025	 * is always emitted first following a switch from BLT.
3026	 */
3027	if (kgem->mode == KGEM_BLT) {
3028		struct sna *sna = to_sna_from_kgem(kgem);
3029		DBG(("%s: forcing drawrect on next state emission\n",
3030		     __FUNCTION__));
3031		sna->render_state.gen5.drawrect_limit = -1;
3032	}
3033
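	/* If the current ring has gone idle, submit now so the hardware
	 * can begin the new batch immediately.
	 */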
3034	if (kgem_ring_is_idle(kgem, kgem->ring)) {
3035		DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
3036		_kgem_submit(kgem);
3037	}
3038}
3039
3040static void
3041discard_vbo(struct sna *sna)
3042{
3043	kgem_bo_destroy(&sna->kgem, sna->render.vbo);
3044	sna->render.vbo = NULL;
3045	sna->render.vertices = sna->render.vertex_data;
3046	sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
3047	sna->render.vertex_used = 0;
3048	sna->render.vertex_index = 0;
3049}
3050
3051static void
3052gen5_render_retire(struct kgem *kgem)
3053{
3054	struct sna *sna;
3055
3056	sna = container_of(kgem, struct sna, kgem);
3057	if (kgem->nbatch == 0 && sna->render.vbo && !kgem_bo_is_busy(sna->render.vbo)) {
3058		DBG(("%s: resetting idle vbo\n", __FUNCTION__));
3059		sna->render.vertex_used = 0;
3060		sna->render.vertex_index = 0;
3061	}
3062}
3063
3064static void
3065gen5_render_expire(struct kgem *kgem)
3066{
3067	struct sna *sna;
3068
3069	sna = container_of(kgem, struct sna, kgem);
3070	if (sna->render.vbo && !sna->render.vertex_used) {
3071		DBG(("%s: discarding vbo\n", __FUNCTION__));
3072		discard_vbo(sna);
3073	}
3074}
3075
3076static void gen5_render_reset(struct sna *sna)
3077{
3078	sna->render_state.gen5.needs_invariant = true;
3079	sna->render_state.gen5.ve_id = -1;
3080	sna->render_state.gen5.last_primitive = -1;
3081	sna->render_state.gen5.last_pipelined_pointers = 0;
3082
3083	sna->render_state.gen5.drawrect_offset = -1;
3084	sna->render_state.gen5.drawrect_limit = -1;
3085	sna->render_state.gen5.surface_table = -1;
3086
3087	if (sna->render.vbo &&
3088	    !kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) {
3089		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
3090		discard_vbo(sna);
3091	}
3092
3093	sna->render.vertex_offset = 0;
3094	sna->render.nvertex_reloc = 0;
3095	sna->render.vb_id = 0;
3096}
3097
3098static void gen5_render_fini(struct sna *sna)
3099{
3100	kgem_bo_destroy(&sna->kgem, sna->render_state.gen5.general_bo);
3101}
3102
3103static uint32_t gen5_create_vs_unit_state(struct sna_static_stream *stream)
3104{
3105	struct gen5_vs_unit_state *vs = sna_static_stream_map(stream, sizeof(*vs), 32);
3106
3107	/* Set up the vertex shader to be disabled (passthrough) */
3108	vs->thread4.nr_urb_entries = URB_VS_ENTRIES >> 2;
3109	vs->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
3110	vs->vs6.vs_enable = 0;
3111	vs->vs6.vert_cache_disable = 1;
3112
3113	return sna_static_stream_offsetof(stream, vs);
3114}
3115
3116static uint32_t gen5_create_sf_state(struct sna_static_stream *stream,
3117				     uint32_t kernel)
3118{
3119	struct gen5_sf_unit_state *sf_state;
3120
3121	sf_state = sna_static_stream_map(stream, sizeof(*sf_state), 32);
3122
3123	sf_state->thread0.grf_reg_count = GEN5_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
3124	sf_state->thread0.kernel_start_pointer = kernel >> 6;
3125
3126	sf_state->thread3.const_urb_entry_read_length = 0;	/* no const URBs */
3127	sf_state->thread3.const_urb_entry_read_offset = 0;	/* no const URBs */
3128	sf_state->thread3.urb_entry_read_length = 1;	/* 1 URB per vertex */
3129	/* don't smash vertex header, read start from dw8 */
3130	sf_state->thread3.urb_entry_read_offset = 1;
3131	sf_state->thread3.dispatch_grf_start_reg = 3;
3132	sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
3133	sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
3134	sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
3135	sf_state->sf5.viewport_transform = false;	/* skip viewport */
3136	sf_state->sf6.cull_mode = GEN5_CULLMODE_NONE;
3137	sf_state->sf6.scissor = 0;
3138	sf_state->sf7.trifan_pv = 2;
3139	sf_state->sf6.dest_org_vbias = 0x8;
3140	sf_state->sf6.dest_org_hbias = 0x8;
3141
3142	return sna_static_stream_offsetof(stream, sf_state);
3143}
3144
3145static uint32_t gen5_create_sampler_state(struct sna_static_stream *stream,
3146					  sampler_filter_t src_filter,
3147					  sampler_extend_t src_extend,
3148					  sampler_filter_t mask_filter,
3149					  sampler_extend_t mask_extend)
3150{
3151	struct gen5_sampler_state *sampler_state;
3152
3153	sampler_state = sna_static_stream_map(stream,
3154					      sizeof(struct gen5_sampler_state) * 2,
3155					      32);
3156	sampler_state_init(&sampler_state[0], src_filter, src_extend);
3157	sampler_state_init(&sampler_state[1], mask_filter, mask_extend);
3158
3159	return sna_static_stream_offsetof(stream, sampler_state);
3160}
3161
3162static void gen5_init_wm_state(struct gen5_wm_unit_state *state,
3163			       bool has_mask,
3164			       uint32_t kernel,
3165			       uint32_t sampler)
3166{
3167	state->thread0.grf_reg_count = GEN5_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
3168	state->thread0.kernel_start_pointer = kernel >> 6;
3169
3170	state->thread1.single_program_flow = 0;
3171
3172	/* scratch space is not used in our kernel */
3173	state->thread2.scratch_space_base_pointer = 0;
3174	state->thread2.per_thread_scratch_space = 0;
3175
3176	state->thread3.const_urb_entry_read_length = 0;
3177	state->thread3.const_urb_entry_read_offset = 0;
3178
3179	state->thread3.urb_entry_read_offset = 0;
3180	/* the wm kernel uses the urb starting at grf 3; see wm_program in the compiler module */
3181	state->thread3.dispatch_grf_start_reg = 3;	/* must match kernel */
3182
3183	state->wm4.sampler_count = 0;	/* hardware requirement */
3184
3185	state->wm4.sampler_state_pointer = sampler >> 5;
3186	state->wm5.max_threads = PS_MAX_THREADS - 1;
3187	state->wm5.transposed_urb_read = 0;
3188	state->wm5.thread_dispatch_enable = 1;
3189	/* just use 16-pixel dispatch (4 subspans); no need to change the
3190	 * kernel start point
3191	 */
3192	state->wm5.enable_16_pix = 1;
3193	state->wm5.enable_8_pix = 0;
3194	state->wm5.early_depth_test = 1;
3195
3196	/* Each pair of attributes (src/mask coords) is two URB entries */
3197	if (has_mask) {
3198		state->thread1.binding_table_entry_count = 3;	/* 2 tex and fb */
3199		state->thread3.urb_entry_read_length = 4;
3200	} else {
3201		state->thread1.binding_table_entry_count = 2;	/* 1 tex and fb */
3202		state->thread3.urb_entry_read_length = 2;
3203	}
3204
3205	/* The binding table entry count is only used for prefetching,
3206	 * and it has to be set to 0 for Ironlake.
3207	 */
3208	state->thread1.binding_table_entry_count = 0;
3209}
3210
3211static uint32_t gen5_create_cc_unit_state(struct sna_static_stream *stream)
3212{
3213	uint8_t *ptr, *base;
3214	int i, j;
3215
3216	base = ptr =
3217		sna_static_stream_map(stream,
3218				      GEN5_BLENDFACTOR_COUNT*GEN5_BLENDFACTOR_COUNT*64,
3219				      64);
3220
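	/* One 64-byte cc unit state per (src, dst) blend-factor pair.
	 * A sketch of the lookup this layout implies:
	 *   cc + (src_factor * GEN5_BLENDFACTOR_COUNT + dst_factor) * 64
	 */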
3221	for (i = 0; i < GEN5_BLENDFACTOR_COUNT; i++) {
3222		for (j = 0; j < GEN5_BLENDFACTOR_COUNT; j++) {
3223			struct gen5_cc_unit_state *state =
3224				(struct gen5_cc_unit_state *)ptr;
3225
3226			state->cc3.blend_enable =
3227				!(j == GEN5_BLENDFACTOR_ZERO && i == GEN5_BLENDFACTOR_ONE);
3228
3229			state->cc5.logicop_func = 0xc;	/* COPY */
3230			state->cc5.ia_blend_function = GEN5_BLENDFUNCTION_ADD;
3231
3232			/* Fill in the alpha blend factors to match the color factors, for future use. */
3233			state->cc5.ia_src_blend_factor = i;
3234			state->cc5.ia_dest_blend_factor = j;
3235
3236			state->cc6.blend_function = GEN5_BLENDFUNCTION_ADD;
3237			state->cc6.clamp_post_alpha_blend = 1;
3238			state->cc6.clamp_pre_alpha_blend = 1;
3239			state->cc6.src_blend_factor = i;
3240			state->cc6.dest_blend_factor = j;
3241
3242			ptr += 64;
3243		}
3244	}
3245
3246	return sna_static_stream_offsetof(stream, base);
3247}
3248
3249static bool gen5_render_setup(struct sna *sna)
3250{
3251	struct gen5_render_state *state = &sna->render_state.gen5;
3252	struct sna_static_stream general;
3253	struct gen5_wm_unit_state_padded *wm_state;
3254	uint32_t sf[2], wm[KERNEL_COUNT];
3255	int i, j, k, l, m;
3256
3257	sna_static_stream_init(&general);
3258
3259	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
3260	 * dumps, you know it points to zero.
3261	 */
3262	null_create(&general);
3263
3264	/* Set up the two SF states (one for blending with a mask, one without) */
3265	sf[0] = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__nomask);
3266	sf[1] = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__mask);
3267
3268	for (m = 0; m < KERNEL_COUNT; m++) {
3269		if (wm_kernels[m].size) {
3270			wm[m] = sna_static_stream_add(&general,
3271						      wm_kernels[m].data,
3272						      wm_kernels[m].size,
3273						      64);
3274		} else {
3275			wm[m] = sna_static_stream_compile_wm(sna, &general,
3276							     wm_kernels[m].data,
3277							     16);
3278		}
3279		assert(wm[m]);
3280	}
3281
3282	state->vs = gen5_create_vs_unit_state(&general);
3283
3284	state->sf[0] = gen5_create_sf_state(&general, sf[0]);
3285	state->sf[1] = gen5_create_sf_state(&general, sf[1]);
3286
3287
3288	/* Set up the WM states: each filter/extend type for source and mask, per
3289	 * kernel.
3290	 */
3291	wm_state = sna_static_stream_map(&general,
3292					  sizeof(*wm_state) * KERNEL_COUNT *
3293					  FILTER_COUNT * EXTEND_COUNT *
3294					  FILTER_COUNT * EXTEND_COUNT,
3295					  64);
3296	state->wm = sna_static_stream_offsetof(&general, wm_state);
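	/* A sketch of the indexing this layout implies (the matching
	 * lookup is performed when the pipelined pointers are emitted):
	 *   wm + (((src_filter * EXTEND_COUNT + src_extend) * FILTER_COUNT
	 *          + mask_filter) * EXTEND_COUNT + mask_extend)
	 *        * KERNEL_COUNT + kernel
	 */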
3297	for (i = 0; i < FILTER_COUNT; i++) {
3298		for (j = 0; j < EXTEND_COUNT; j++) {
3299			for (k = 0; k < FILTER_COUNT; k++) {
3300				for (l = 0; l < EXTEND_COUNT; l++) {
3301					uint32_t sampler_state;
3302
3303					sampler_state =
3304						gen5_create_sampler_state(&general,
3305									  i, j,
3306									  k, l);
3307
3308					for (m = 0; m < KERNEL_COUNT; m++) {
3309						gen5_init_wm_state(&wm_state->state,
3310								   wm_kernels[m].has_mask,
3311								   wm[m], sampler_state);
3312						wm_state++;
3313					}
3314				}
3315			}
3316		}
3317	}
3318
3319	state->cc = gen5_create_cc_unit_state(&general);
3320
3321	state->general_bo = sna_static_stream_fini(sna, &general);
3322	return state->general_bo != NULL;
3323}
3324
3325const char *gen5_render_init(struct sna *sna, const char *backend)
3326{
3327	if (!gen5_render_setup(sna))
3328		return backend;
3329
3330	sna->kgem.context_switch = gen5_render_context_switch;
3331	sna->kgem.retire = gen5_render_retire;
3332	sna->kgem.expire = gen5_render_expire;
3333
3334#if !NO_COMPOSITE
3335	sna->render.composite = gen5_render_composite;
3336	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
3337#endif
3338#if !NO_COMPOSITE_SPANS
3339	sna->render.check_composite_spans = gen5_check_composite_spans;
3340	sna->render.composite_spans = gen5_render_composite_spans;
3341	if (sna->PciInfo->device_id == 0x0044)
3342		sna->render.prefer_gpu |= PREFER_GPU_SPANS;
3343#endif
3344	sna->render.video = gen5_render_video;
3345
3346	sna->render.copy_boxes = gen5_render_copy_boxes;
3347	sna->render.copy = gen5_render_copy;
3348
3349	sna->render.fill_boxes = gen5_render_fill_boxes;
3350	sna->render.fill = gen5_render_fill;
3351	sna->render.fill_one = gen5_render_fill_one;
3352
3353	sna->render.flush = gen5_render_flush;
3354	sna->render.reset = gen5_render_reset;
3355	sna->render.fini = gen5_render_fini;
3356
3357	sna->render.max_3d_size = MAX_3D_SIZE;
3358	sna->render.max_3d_pitch = 1 << 18;
3359	return "Ironlake (gen5)";
3360}
3361