1/*
2 * Copyright © 2006,2008,2011 Intel Corporation
3 * Copyright © 2007 Red Hat, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors:
25 *    Wang Zhenyu <zhenyu.z.wang@sna.com>
26 *    Eric Anholt <eric@anholt.net>
27 *    Carl Worth <cworth@redhat.com>
28 *    Keith Packard <keithp@keithp.com>
29 *    Chris Wilson <chris@chris-wilson.co.uk>
30 *
31 */
32
33#ifdef HAVE_CONFIG_H
34#include "config.h"
35#endif
36
37#include "sna.h"
38#include "sna_reg.h"
39#include "sna_render.h"
40#include "sna_render_inline.h"
41#include "sna_video.h"
42
43#include "brw/brw.h"
44#include "gen4_common.h"
45#include "gen4_render.h"
46#include "gen4_source.h"
47#include "gen4_vertex.h"
48
49/* gen4 has a serious issue with its shaders that we need to flush
50 * after every rectangle... So until that is resolved, prefer
51 * the BLT engine.
52 */
53#define FORCE_SPANS 0
54#define FORCE_NONRECTILINEAR_SPANS -1
55#define FORCE_FLUSH 1 /* https://bugs.freedesktop.org/show_bug.cgi?id=55500 */
56
57#define ALWAYS_FLUSH 1
58
59#define NO_COMPOSITE 0
60#define NO_COMPOSITE_SPANS 0
61#define NO_COPY 0
62#define NO_COPY_BOXES 0
63#define NO_FILL 0
64#define NO_FILL_ONE 0
65#define NO_FILL_BOXES 0
66#define NO_VIDEO 0
67
68#define MAX_FLUSH_VERTICES 1 /* was 6, https://bugs.freedesktop.org/show_bug.cgi?id=55500 */
69
70#define GEN4_GRF_BLOCKS(nreg)    ((nreg + 15) / 16 - 1)
71
72/* Set up a default static partitioning of the URB, which is supposed to
73 * allow anything we would want to do, at potentially lower performance.
74 */
75#define URB_CS_ENTRY_SIZE     1
76#define URB_CS_ENTRIES	      0
77
78#define URB_VS_ENTRY_SIZE     1
79#define URB_VS_ENTRIES	      32
80
81#define URB_GS_ENTRY_SIZE     0
82#define URB_GS_ENTRIES	      0
83
84#define URB_CL_ENTRY_SIZE   0
85#define URB_CL_ENTRIES      0
86
87#define URB_SF_ENTRY_SIZE     2
88#define URB_SF_ENTRIES	      64
89
90/*
91 * this program computes dA/dx and dA/dy for the texture coordinates along
92 * with the base texture coordinate. It was extracted from the Mesa driver
93 */
94
95#define SF_KERNEL_NUM_GRF 16
96#define PS_KERNEL_NUM_GRF 32
97
98#define GEN4_MAX_SF_THREADS 24
99#define GEN4_MAX_WM_THREADS 32
100#define G4X_MAX_WM_THREADS 50
101
102static const uint32_t ps_kernel_packed_static[][4] = {
103#include "exa_wm_xy.g4b"
104#include "exa_wm_src_affine.g4b"
105#include "exa_wm_src_sample_argb.g4b"
106#include "exa_wm_yuv_rgb.g4b"
107#include "exa_wm_write.g4b"
108};
109
110static const uint32_t ps_kernel_planar_static[][4] = {
111#include "exa_wm_xy.g4b"
112#include "exa_wm_src_affine.g4b"
113#include "exa_wm_src_sample_planar.g4b"
114#include "exa_wm_yuv_rgb.g4b"
115#include "exa_wm_write.g4b"
116};
117
118#define NOKERNEL(kernel_enum, func, masked) \
119    [kernel_enum] = {func, 0, masked}
120#define KERNEL(kernel_enum, kernel, masked) \
121    [kernel_enum] = {&kernel, sizeof(kernel), masked}
122static const struct wm_kernel_info {
123	const void *data;
124	unsigned int size;
125	bool has_mask;
126} wm_kernels[] = {
127	NOKERNEL(WM_KERNEL, brw_wm_kernel__affine, false),
128	NOKERNEL(WM_KERNEL_P, brw_wm_kernel__projective, false),
129
130	NOKERNEL(WM_KERNEL_MASK, brw_wm_kernel__affine_mask, true),
131	NOKERNEL(WM_KERNEL_MASK_P, brw_wm_kernel__projective_mask, true),
132
133	NOKERNEL(WM_KERNEL_MASKCA, brw_wm_kernel__affine_mask_ca, true),
134	NOKERNEL(WM_KERNEL_MASKCA_P, brw_wm_kernel__projective_mask_ca, true),
135
136	NOKERNEL(WM_KERNEL_MASKSA, brw_wm_kernel__affine_mask_sa, true),
137	NOKERNEL(WM_KERNEL_MASKSA_P, brw_wm_kernel__projective_mask_sa, true),
138
139	NOKERNEL(WM_KERNEL_OPACITY, brw_wm_kernel__affine_opacity, true),
140	NOKERNEL(WM_KERNEL_OPACITY_P, brw_wm_kernel__projective_opacity, true),
141
142	KERNEL(WM_KERNEL_VIDEO_PLANAR, ps_kernel_planar_static, false),
143	KERNEL(WM_KERNEL_VIDEO_PACKED, ps_kernel_packed_static, false),
144};
145#undef KERNEL
146
147static const struct blendinfo {
148	bool src_alpha;
149	uint32_t src_blend;
150	uint32_t dst_blend;
151} gen4_blend_op[] = {
152	/* Clear */	{0, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_ZERO},
153	/* Src */	{0, GEN4_BLENDFACTOR_ONE, GEN4_BLENDFACTOR_ZERO},
154	/* Dst */	{0, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_ONE},
155	/* Over */	{1, GEN4_BLENDFACTOR_ONE, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
156	/* OverReverse */ {0, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_ONE},
157	/* In */	{0, GEN4_BLENDFACTOR_DST_ALPHA, GEN4_BLENDFACTOR_ZERO},
158	/* InReverse */	{1, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_SRC_ALPHA},
159	/* Out */	{0, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_ZERO},
160	/* OutReverse */ {1, GEN4_BLENDFACTOR_ZERO, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
161	/* Atop */	{1, GEN4_BLENDFACTOR_DST_ALPHA, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
162	/* AtopReverse */ {1, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_SRC_ALPHA},
163	/* Xor */	{1, GEN4_BLENDFACTOR_INV_DST_ALPHA, GEN4_BLENDFACTOR_INV_SRC_ALPHA},
164	/* Add */	{0, GEN4_BLENDFACTOR_ONE, GEN4_BLENDFACTOR_ONE},
165};
166
167/**
168 * Highest-valued BLENDFACTOR used in gen4_blend_op.
169 *
170 * This leaves out GEN4_BLENDFACTOR_INV_DST_COLOR,
171 * GEN4_BLENDFACTOR_INV_CONST_{COLOR,ALPHA},
172 * GEN4_BLENDFACTOR_INV_SRC1_{COLOR,ALPHA}
173 */
174#define GEN4_BLENDFACTOR_COUNT (GEN4_BLENDFACTOR_INV_DST_ALPHA + 1)
175
176#define BLEND_OFFSET(s, d) \
177	(((s) * GEN4_BLENDFACTOR_COUNT + (d)) * 64)
178
179#define SAMPLER_OFFSET(sf, se, mf, me, k) \
180	((((((sf) * EXTEND_COUNT + (se)) * FILTER_COUNT + (mf)) * EXTEND_COUNT + (me)) * KERNEL_COUNT + (k)) * 64)
181
182static void
183gen4_emit_pipelined_pointers(struct sna *sna,
184			     const struct sna_composite_op *op,
185			     int blend, int kernel);
186
187#define OUT_BATCH(v) batch_emit(sna, v)
188#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y)
189#define OUT_VERTEX_F(v) vertex_emit(sna, v)
190
191#define GEN4_MAX_3D_SIZE 8192
192
193static inline bool too_large(int width, int height)
194{
195	return width > GEN4_MAX_3D_SIZE || height > GEN4_MAX_3D_SIZE;
196}
197
198static int
199gen4_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
200{
201	int base;
202
203	if (has_mask) {
204		if (is_ca) {
205			if (gen4_blend_op[op].src_alpha)
206				base = WM_KERNEL_MASKSA;
207			else
208				base = WM_KERNEL_MASKCA;
209		} else
210			base = WM_KERNEL_MASK;
211	} else
212		base = WM_KERNEL;
213
214	return base + !is_affine;
215}
216
217static bool gen4_magic_ca_pass(struct sna *sna,
218			       const struct sna_composite_op *op)
219{
220	struct gen4_render_state *state = &sna->render_state.gen4;
221
222	if (!op->need_magic_ca_pass)
223		return false;
224
225	assert(sna->render.vertex_index > sna->render.vertex_start);
226
227	DBG(("%s: CA fixup\n", __FUNCTION__));
228	assert(op->mask.bo != NULL);
229	assert(op->has_component_alpha);
230
231	gen4_emit_pipelined_pointers(sna, op, PictOpAdd,
232				     gen4_choose_composite_kernel(PictOpAdd,
233								  true, true, op->is_affine));
234
235	OUT_BATCH(GEN4_3DPRIMITIVE |
236		  GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
237		  (_3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT) |
238		  (0 << 9) |
239		  4);
240	OUT_BATCH(sna->render.vertex_index - sna->render.vertex_start);
241	OUT_BATCH(sna->render.vertex_start);
242	OUT_BATCH(1);	/* single instance */
243	OUT_BATCH(0);	/* start instance location */
244	OUT_BATCH(0);	/* index buffer offset, ignored */
245
246	state->last_primitive = sna->kgem.nbatch;
247	return true;
248}
249
250static uint32_t gen4_get_blend(int op,
251			       bool has_component_alpha,
252			       uint32_t dst_format)
253{
254	uint32_t src, dst;
255
256	src = gen4_blend_op[op].src_blend;
257	dst = gen4_blend_op[op].dst_blend;
258
259	/* If there's no dst alpha channel, adjust the blend op so that we'll treat
260	 * it as always 1.
261	 */
262	if (PICT_FORMAT_A(dst_format) == 0) {
263		if (src == GEN4_BLENDFACTOR_DST_ALPHA)
264			src = GEN4_BLENDFACTOR_ONE;
265		else if (src == GEN4_BLENDFACTOR_INV_DST_ALPHA)
266			src = GEN4_BLENDFACTOR_ZERO;
267	}
268
269	/* If the source alpha is being used, then we should only be in a
270	 * case where the source blend factor is 0, and the source blend
271	 * value is the mask channels multiplied by the source picture's alpha.
272	 */
273	if (has_component_alpha && gen4_blend_op[op].src_alpha) {
274		if (dst == GEN4_BLENDFACTOR_SRC_ALPHA)
275			dst = GEN4_BLENDFACTOR_SRC_COLOR;
276		else if (dst == GEN4_BLENDFACTOR_INV_SRC_ALPHA)
277			dst = GEN4_BLENDFACTOR_INV_SRC_COLOR;
278	}
279
280	DBG(("blend op=%d, dst=%x [A=%d] => src=%d, dst=%d => offset=%x\n",
281	     op, dst_format, PICT_FORMAT_A(dst_format),
282	     src, dst, BLEND_OFFSET(src, dst)));
283	return BLEND_OFFSET(src, dst);
284}
285
286static uint32_t gen4_get_card_format(PictFormat format)
287{
288	switch (format) {
289	default:
290		return -1;
291	case PICT_a8r8g8b8:
292		return GEN4_SURFACEFORMAT_B8G8R8A8_UNORM;
293	case PICT_x8r8g8b8:
294		return GEN4_SURFACEFORMAT_B8G8R8X8_UNORM;
295	case PICT_a8b8g8r8:
296		return GEN4_SURFACEFORMAT_R8G8B8A8_UNORM;
297	case PICT_x8b8g8r8:
298		return GEN4_SURFACEFORMAT_R8G8B8X8_UNORM;
299#ifdef PICT_a2r10g10b10
300	case PICT_a2r10g10b10:
301		return GEN4_SURFACEFORMAT_B10G10R10A2_UNORM;
302	case PICT_x2r10g10b10:
303		return GEN4_SURFACEFORMAT_B10G10R10X2_UNORM;
304#endif
305	case PICT_r8g8b8:
306		return GEN4_SURFACEFORMAT_R8G8B8_UNORM;
307	case PICT_r5g6b5:
308		return GEN4_SURFACEFORMAT_B5G6R5_UNORM;
309	case PICT_a1r5g5b5:
310		return GEN4_SURFACEFORMAT_B5G5R5A1_UNORM;
311	case PICT_a8:
312		return GEN4_SURFACEFORMAT_A8_UNORM;
313	case PICT_a4r4g4b4:
314		return GEN4_SURFACEFORMAT_B4G4R4A4_UNORM;
315	}
316}
317
318static uint32_t gen4_get_dest_format(PictFormat format)
319{
320	switch (format) {
321	default:
322		return -1;
323	case PICT_a8r8g8b8:
324	case PICT_x8r8g8b8:
325		return GEN4_SURFACEFORMAT_B8G8R8A8_UNORM;
326	case PICT_a8b8g8r8:
327	case PICT_x8b8g8r8:
328		return GEN4_SURFACEFORMAT_R8G8B8A8_UNORM;
329#ifdef PICT_a2r10g10b10
330	case PICT_a2r10g10b10:
331	case PICT_x2r10g10b10:
332		return GEN4_SURFACEFORMAT_B10G10R10A2_UNORM;
333#endif
334	case PICT_r5g6b5:
335		return GEN4_SURFACEFORMAT_B5G6R5_UNORM;
336	case PICT_x1r5g5b5:
337	case PICT_a1r5g5b5:
338		return GEN4_SURFACEFORMAT_B5G5R5A1_UNORM;
339	case PICT_a8:
340		return GEN4_SURFACEFORMAT_A8_UNORM;
341	case PICT_a4r4g4b4:
342	case PICT_x4r4g4b4:
343		return GEN4_SURFACEFORMAT_B4G4R4A4_UNORM;
344	}
345}
346
347static bool gen4_check_dst_format(PictFormat format)
348{
349	if (gen4_get_dest_format(format) != -1)
350		return true;
351
352	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
353	return false;
354}
355
356static bool gen4_check_format(uint32_t format)
357{
358	if (gen4_get_card_format(format) != -1)
359		return true;
360
361	DBG(("%s: unhandled format: %x\n", __FUNCTION__, (int)format));
362	return false;
363}
364
365typedef struct gen4_surface_state_padded {
366	struct gen4_surface_state state;
367	char pad[32 - sizeof(struct gen4_surface_state)];
368} gen4_surface_state_padded;
369
370static void null_create(struct sna_static_stream *stream)
371{
372	/* A bunch of zeros useful for legacy border color and depth-stencil */
373	sna_static_stream_map(stream, 64, 64);
374}
375
376static void
377sampler_state_init(struct gen4_sampler_state *sampler_state,
378		   sampler_filter_t filter,
379		   sampler_extend_t extend)
380{
381	sampler_state->ss0.lod_preclamp = 1;	/* GL mode */
382
383	/* We use the legacy mode to get the semantics specified by
384	 * the Render extension. */
385	sampler_state->ss0.border_color_mode = GEN4_BORDER_COLOR_MODE_LEGACY;
386
387	switch (filter) {
388	default:
389	case SAMPLER_FILTER_NEAREST:
390		sampler_state->ss0.min_filter = GEN4_MAPFILTER_NEAREST;
391		sampler_state->ss0.mag_filter = GEN4_MAPFILTER_NEAREST;
392		break;
393	case SAMPLER_FILTER_BILINEAR:
394		sampler_state->ss0.min_filter = GEN4_MAPFILTER_LINEAR;
395		sampler_state->ss0.mag_filter = GEN4_MAPFILTER_LINEAR;
396		break;
397	}
398
399	switch (extend) {
400	default:
401	case SAMPLER_EXTEND_NONE:
402		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_CLAMP_BORDER;
403		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_CLAMP_BORDER;
404		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_CLAMP_BORDER;
405		break;
406	case SAMPLER_EXTEND_REPEAT:
407		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_WRAP;
408		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_WRAP;
409		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_WRAP;
410		break;
411	case SAMPLER_EXTEND_PAD:
412		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_CLAMP;
413		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_CLAMP;
414		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_CLAMP;
415		break;
416	case SAMPLER_EXTEND_REFLECT:
417		sampler_state->ss1.r_wrap_mode = GEN4_TEXCOORDMODE_MIRROR;
418		sampler_state->ss1.s_wrap_mode = GEN4_TEXCOORDMODE_MIRROR;
419		sampler_state->ss1.t_wrap_mode = GEN4_TEXCOORDMODE_MIRROR;
420		break;
421	}
422}
423
424static uint32_t gen4_filter(uint32_t filter)
425{
426	switch (filter) {
427	default:
428		assert(0);
429	case PictFilterNearest:
430		return SAMPLER_FILTER_NEAREST;
431	case PictFilterBilinear:
432		return SAMPLER_FILTER_BILINEAR;
433	}
434}
435
436static uint32_t gen4_check_filter(PicturePtr picture)
437{
438	switch (picture->filter) {
439	case PictFilterNearest:
440	case PictFilterBilinear:
441		return true;
442	default:
443		DBG(("%s: unknown filter: %s [%d]\n",
444		     __FUNCTION__,
445		     PictureGetFilterName(picture->filter),
446		     picture->filter));
447		return false;
448	}
449}
450
451static uint32_t gen4_repeat(uint32_t repeat)
452{
453	switch (repeat) {
454	default:
455		assert(0);
456	case RepeatNone:
457		return SAMPLER_EXTEND_NONE;
458	case RepeatNormal:
459		return SAMPLER_EXTEND_REPEAT;
460	case RepeatPad:
461		return SAMPLER_EXTEND_PAD;
462	case RepeatReflect:
463		return SAMPLER_EXTEND_REFLECT;
464	}
465}
466
467static bool gen4_check_repeat(PicturePtr picture)
468{
469	if (!picture->repeat)
470		return true;
471
472	switch (picture->repeatType) {
473	case RepeatNone:
474	case RepeatNormal:
475	case RepeatPad:
476	case RepeatReflect:
477		return true;
478	default:
479		DBG(("%s: unknown repeat: %d\n",
480		     __FUNCTION__, picture->repeatType));
481		return false;
482	}
483}
484
485static uint32_t
486gen4_tiling_bits(uint32_t tiling)
487{
488	switch (tiling) {
489	default: assert(0);
490	case I915_TILING_NONE: return 0;
491	case I915_TILING_X: return GEN4_SURFACE_TILED;
492	case I915_TILING_Y: return GEN4_SURFACE_TILED | GEN4_SURFACE_TILED_Y;
493	}
494}
495
496/**
497 * Sets up the common fields for a surface state buffer for the given
498 * picture in the given surface state buffer.
499 */
500static uint32_t
501gen4_bind_bo(struct sna *sna,
502	     struct kgem_bo *bo,
503	     uint32_t width,
504	     uint32_t height,
505	     uint32_t format,
506	     bool is_dst)
507{
508	uint32_t domains;
509	uint16_t offset;
510	uint32_t *ss;
511
512	assert(sna->kgem.gen != 040 || !kgem_bo_is_snoop(bo));
513
514	/* After the first bind, we manage the cache domains within the batch */
515	offset = kgem_bo_get_binding(bo, format | is_dst << 31);
516	if (offset) {
517		assert(offset >= sna->kgem.surface);
518		if (is_dst)
519			kgem_bo_mark_dirty(bo);
520		return offset * sizeof(uint32_t);
521	}
522
523	offset = sna->kgem.surface -=
524		sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
525	ss = sna->kgem.batch + offset;
526
527	ss[0] = (GEN4_SURFACE_2D << GEN4_SURFACE_TYPE_SHIFT |
528		 GEN4_SURFACE_BLEND_ENABLED |
529		 format << GEN4_SURFACE_FORMAT_SHIFT);
530
531	if (is_dst) {
532		ss[0] |= GEN4_SURFACE_RC_READ_WRITE;
533		domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER;
534	} else
535		domains = I915_GEM_DOMAIN_SAMPLER << 16;
536	ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0);
537
538	ss[2] = ((width - 1)  << GEN4_SURFACE_WIDTH_SHIFT |
539		 (height - 1) << GEN4_SURFACE_HEIGHT_SHIFT);
540	ss[3] = (gen4_tiling_bits(bo->tiling) |
541		 (bo->pitch - 1) << GEN4_SURFACE_PITCH_SHIFT);
542	ss[4] = 0;
543	ss[5] = 0;
544
545	kgem_bo_set_binding(bo, format | is_dst << 31, offset);
546
547	DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
548	     offset, bo->handle, ss[1],
549	     format, width, height, bo->pitch, bo->tiling,
550	     domains & 0xffff ? "render" : "sampler"));
551
552	return offset * sizeof(uint32_t);
553}
554
555static void gen4_emit_vertex_buffer(struct sna *sna,
556				    const struct sna_composite_op *op)
557{
558	int id = op->u.gen4.ve_id;
559
560	assert((sna->render.vb_id & (1 << id)) == 0);
561
562	OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | 3);
563	OUT_BATCH((id << VB0_BUFFER_INDEX_SHIFT) | VB0_VERTEXDATA |
564		  (4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
565	assert(sna->render.nvertex_reloc < ARRAY_SIZE(sna->render.vertex_reloc));
566	sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
567	OUT_BATCH(0);
568	OUT_BATCH(0);
569	OUT_BATCH(0);
570
571	sna->render.vb_id |= 1 << id;
572}
573
574inline static void
575gen4_emit_pipe_flush(struct sna *sna)
576{
577#if 1
578	OUT_BATCH(GEN4_PIPE_CONTROL |
579		  GEN4_PIPE_CONTROL_WC_FLUSH |
580		  (4 - 2));
581	OUT_BATCH(0);
582	OUT_BATCH(0);
583	OUT_BATCH(0);
584#else
585	OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
586#endif
587}
588
589inline static void
590gen4_emit_pipe_break(struct sna *sna)
591{
592#if !ALWAYS_FLUSH
593	OUT_BATCH(GEN4_PIPE_CONTROL | (4 - 2));
594	OUT_BATCH(0);
595	OUT_BATCH(0);
596	OUT_BATCH(0);
597#else
598	OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
599#endif
600}
601
602inline static void
603gen4_emit_pipe_invalidate(struct sna *sna)
604{
605#if 0
606	OUT_BATCH(GEN4_PIPE_CONTROL |
607		  GEN4_PIPE_CONTROL_WC_FLUSH |
608		  (sna->kgem.gen >= 045 ? GEN4_PIPE_CONTROL_TC_FLUSH : 0) |
609		  (4 - 2));
610	OUT_BATCH(0);
611	OUT_BATCH(0);
612	OUT_BATCH(0);
613#else
614	OUT_BATCH(MI_FLUSH);
615#endif
616}
617
618static void gen4_emit_primitive(struct sna *sna)
619{
620	if (sna->kgem.nbatch == sna->render_state.gen4.last_primitive) {
621		sna->render.vertex_offset = sna->kgem.nbatch - 5;
622		return;
623	}
624
625	OUT_BATCH(GEN4_3DPRIMITIVE |
626		  GEN4_3DPRIMITIVE_VERTEX_SEQUENTIAL |
627		  (_3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT) |
628		  (0 << 9) |
629		  4);
630	sna->render.vertex_offset = sna->kgem.nbatch;
631	OUT_BATCH(0);	/* vertex count, to be filled in later */
632	OUT_BATCH(sna->render.vertex_index);
633	OUT_BATCH(1);	/* single instance */
634	OUT_BATCH(0);	/* start instance location */
635	OUT_BATCH(0);	/* index buffer offset, ignored */
636	sna->render.vertex_start = sna->render.vertex_index;
637
638	sna->render_state.gen4.last_primitive = sna->kgem.nbatch;
639}
640
641static bool gen4_rectangle_begin(struct sna *sna,
642				 const struct sna_composite_op *op)
643{
644	unsigned int id = 1 << op->u.gen4.ve_id;
645	int ndwords;
646
647	if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
648		return true;
649
650	/* 7xpipelined pointers + 6xprimitive + 1xflush */
651	ndwords = op->need_magic_ca_pass? 19 : 6;
652	if ((sna->render.vb_id & id) == 0)
653		ndwords += 5;
654	ndwords += 8*FORCE_FLUSH;
655
656	if (!kgem_check_batch(&sna->kgem, ndwords))
657		return false;
658
659	if ((sna->render.vb_id & id) == 0)
660		gen4_emit_vertex_buffer(sna, op);
661	if (sna->render.vertex_offset == 0)
662		gen4_emit_primitive(sna);
663
664	return true;
665}
666
667static int gen4_get_rectangles__flush(struct sna *sna,
668				      const struct sna_composite_op *op)
669{
670	/* Preventing discarding new vbo after lock contention */
671	if (sna_vertex_wait__locked(&sna->render)) {
672		int rem = vertex_space(sna);
673		if (rem > op->floats_per_rect)
674			return rem;
675	}
676
677	if (!kgem_check_batch(&sna->kgem,
678			      8*FORCE_FLUSH + (op->need_magic_ca_pass ? 2*19+6 : 6)))
679		return 0;
680	if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
681		return 0;
682
683	if (sna->render.vertex_offset) {
684		gen4_vertex_flush(sna);
685		if (gen4_magic_ca_pass(sna, op))
686			gen4_emit_pipelined_pointers(sna, op, op->op,
687						     op->u.gen4.wm_kernel);
688	}
689
690	return gen4_vertex_finish(sna);
691}
692
693inline static int gen4_get_rectangles(struct sna *sna,
694				      const struct sna_composite_op *op,
695				      int want,
696				      void (*emit_state)(struct sna *sna, const struct sna_composite_op *op))
697{
698	int rem;
699
700	assert(want);
701#if FORCE_FLUSH
702	rem = sna->render.vertex_offset;
703	if (sna->kgem.nbatch == sna->render_state.gen4.last_primitive)
704		rem = sna->kgem.nbatch - 5;
705	if (rem) {
706		rem = MAX_FLUSH_VERTICES - (sna->render.vertex_index - sna->render.vertex_start) / 3;
707		if (rem <= 0) {
708			if (sna->render.vertex_offset) {
709				gen4_vertex_flush(sna);
710				if (gen4_magic_ca_pass(sna, op)) {
711					if (kgem_check_batch(&sna->kgem, 19+6))
712						gen4_emit_pipelined_pointers(sna, op, op->op,
713									     op->u.gen4.wm_kernel);
714				}
715			}
716			gen4_emit_pipe_break(sna);
717			rem = MAX_FLUSH_VERTICES;
718		}
719	} else
720		rem = MAX_FLUSH_VERTICES;
721	if (want > rem)
722		want = rem;
723#endif
724
725start:
726	rem = vertex_space(sna);
727	if (unlikely(rem < op->floats_per_rect)) {
728		DBG(("flushing vbo for %s: %d < %d\n",
729		     __FUNCTION__, rem, op->floats_per_rect));
730		rem = gen4_get_rectangles__flush(sna, op);
731		if (unlikely(rem == 0))
732			goto flush;
733	}
734
735	if (unlikely(sna->render.vertex_offset == 0)) {
736		if (!gen4_rectangle_begin(sna, op))
737			goto flush;
738		else
739			goto start;
740	}
741
742	assert(rem <= vertex_space(sna));
743	assert(op->floats_per_rect <= rem);
744	if (want > 1 && want * op->floats_per_rect > rem)
745		want = rem / op->floats_per_rect;
746
747	sna->render.vertex_index += 3*want;
748	return want;
749
750flush:
751	if (sna->render.vertex_offset) {
752		gen4_vertex_flush(sna);
753		gen4_magic_ca_pass(sna, op);
754	}
755	sna_vertex_wait__locked(&sna->render);
756	_kgem_submit(&sna->kgem);
757	emit_state(sna, op);
758	goto start;
759}
760
761static uint32_t *
762gen4_composite_get_binding_table(struct sna *sna, uint16_t *offset)
763{
764	sna->kgem.surface -=
765		sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
766
767	DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));
768
769	/* Clear all surplus entries to zero in case of prefetch */
770	*offset = sna->kgem.surface;
771	return memset(sna->kgem.batch + sna->kgem.surface,
772		      0, sizeof(struct gen4_surface_state_padded));
773}
774
775static void
776gen4_emit_urb(struct sna *sna)
777{
778	int urb_vs_end;
779	int urb_gs_end;
780	int urb_cl_end;
781	int urb_sf_end;
782	int urb_cs_end;
783
784	if (!sna->render_state.gen4.needs_urb)
785		return;
786
787	urb_vs_end =              URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
788	urb_gs_end = urb_vs_end + URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
789	urb_cl_end = urb_gs_end + URB_CL_ENTRIES * URB_CL_ENTRY_SIZE;
790	urb_sf_end = urb_cl_end + URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
791	urb_cs_end = urb_sf_end + URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
792	assert(urb_cs_end <= 256);
793
794	while ((sna->kgem.nbatch & 15) > 12)
795		OUT_BATCH(MI_NOOP);
796
797	OUT_BATCH(GEN4_URB_FENCE |
798		  UF0_CS_REALLOC |
799		  UF0_SF_REALLOC |
800		  UF0_CLIP_REALLOC |
801		  UF0_GS_REALLOC |
802		  UF0_VS_REALLOC |
803		  1);
804	OUT_BATCH(urb_cl_end << UF1_CLIP_FENCE_SHIFT |
805		  urb_gs_end << UF1_GS_FENCE_SHIFT |
806		  urb_vs_end << UF1_VS_FENCE_SHIFT);
807	OUT_BATCH(urb_cs_end << UF2_CS_FENCE_SHIFT |
808		  urb_sf_end << UF2_SF_FENCE_SHIFT);
809
810	/* Constant buffer state */
811	OUT_BATCH(GEN4_CS_URB_STATE | 0);
812	OUT_BATCH((URB_CS_ENTRY_SIZE - 1) << 4 | URB_CS_ENTRIES << 0);
813
814	sna->render_state.gen4.needs_urb = false;
815}
816
817static void
818gen4_emit_state_base_address(struct sna *sna)
819{
820	assert(sna->render_state.gen4.general_bo->proxy == NULL);
821	OUT_BATCH(GEN4_STATE_BASE_ADDRESS | 4);
822	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* general */
823				 sna->kgem.nbatch,
824				 sna->render_state.gen4.general_bo,
825				 I915_GEM_DOMAIN_INSTRUCTION << 16,
826				 BASE_ADDRESS_MODIFY));
827	OUT_BATCH(kgem_add_reloc(&sna->kgem, /* surface */
828				 sna->kgem.nbatch,
829				 NULL,
830				 I915_GEM_DOMAIN_INSTRUCTION << 16,
831				 BASE_ADDRESS_MODIFY));
832	OUT_BATCH(0); /* media */
833
834	/* upper bounds, all disabled */
835	OUT_BATCH(BASE_ADDRESS_MODIFY);
836	OUT_BATCH(0);
837}
838
839static void
840gen4_emit_invariant(struct sna *sna)
841{
842	assert(sna->kgem.surface == sna->kgem.batch_size);
843
844	if (sna->kgem.gen >= 045)
845		OUT_BATCH(NEW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
846	else
847		OUT_BATCH(GEN4_PIPELINE_SELECT | PIPELINE_SELECT_3D);
848
849	gen4_emit_state_base_address(sna);
850
851	sna->render_state.gen4.needs_invariant = false;
852}
853
854static void
855gen4_get_batch(struct sna *sna, const struct sna_composite_op *op)
856{
857	kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
858
859	if (!kgem_check_batch_with_surfaces(&sna->kgem, 150 + 50*FORCE_FLUSH, 4)) {
860		DBG(("%s: flushing batch: %d < %d+%d\n",
861		     __FUNCTION__, sna->kgem.surface - sna->kgem.nbatch,
862		     150, 4*8));
863		kgem_submit(&sna->kgem);
864		_kgem_set_mode(&sna->kgem, KGEM_RENDER);
865	}
866
867	if (sna->render_state.gen4.needs_invariant)
868		gen4_emit_invariant(sna);
869}
870
871static void
872gen4_align_vertex(struct sna *sna, const struct sna_composite_op *op)
873{
874	assert(op->floats_per_rect == 3*op->floats_per_vertex);
875	if (op->floats_per_vertex != sna->render_state.gen4.floats_per_vertex) {
876		DBG(("aligning vertex: was %d, now %d floats per vertex\n",
877		     sna->render_state.gen4.floats_per_vertex,
878		     op->floats_per_vertex));
879		gen4_vertex_align(sna, op);
880		sna->render_state.gen4.floats_per_vertex = op->floats_per_vertex;
881	}
882}
883
884static void
885gen4_emit_binding_table(struct sna *sna, uint16_t offset)
886{
887	if (sna->render_state.gen4.surface_table == offset)
888		return;
889
890	sna->render_state.gen4.surface_table = offset;
891
892	/* Binding table pointers */
893	OUT_BATCH(GEN4_3DSTATE_BINDING_TABLE_POINTERS | 4);
894	OUT_BATCH(0);		/* vs */
895	OUT_BATCH(0);		/* gs */
896	OUT_BATCH(0);		/* clip */
897	OUT_BATCH(0);		/* sf */
898	/* Only the PS uses the binding table */
899	OUT_BATCH(offset*4);
900}
901
902static void
903gen4_emit_pipelined_pointers(struct sna *sna,
904			     const struct sna_composite_op *op,
905			     int blend, int kernel)
906{
907	uint16_t sp, bp;
908	uint32_t key;
909
910	DBG(("%s: has_mask=%d, src=(%d, %d), mask=(%d, %d),kernel=%d, blend=%d, ca=%d, format=%x\n",
911	     __FUNCTION__, op->u.gen4.ve_id & 2,
912	     op->src.filter, op->src.repeat,
913	     op->mask.filter, op->mask.repeat,
914	     kernel, blend, op->has_component_alpha, (int)op->dst.format));
915
916	sp = SAMPLER_OFFSET(op->src.filter, op->src.repeat,
917			    op->mask.filter, op->mask.repeat,
918			    kernel);
919	bp = gen4_get_blend(blend, op->has_component_alpha, op->dst.format);
920
921	DBG(("%s: sp=%d, bp=%d\n", __FUNCTION__, sp, bp));
922	key = sp | (uint32_t)bp << 16;
923	if (key == sna->render_state.gen4.last_pipelined_pointers)
924		return;
925
926	OUT_BATCH(GEN4_3DSTATE_PIPELINED_POINTERS | 5);
927	OUT_BATCH(sna->render_state.gen4.vs);
928	OUT_BATCH(GEN4_GS_DISABLE); /* passthrough */
929	OUT_BATCH(GEN4_CLIP_DISABLE); /* passthrough */
930	OUT_BATCH(sna->render_state.gen4.sf);
931	OUT_BATCH(sna->render_state.gen4.wm + sp);
932	OUT_BATCH(sna->render_state.gen4.cc + bp);
933
934	sna->render_state.gen4.last_pipelined_pointers = key;
935	gen4_emit_urb(sna);
936}
937
938static bool
939gen4_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
940{
941	uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
942	uint32_t offset = (uint16_t)op->dst.y << 16 | (uint16_t)op->dst.x;
943
944	assert(!too_large(abs(op->dst.x), abs(op->dst.y)));
945	assert(!too_large(op->dst.width, op->dst.height));
946
947	if (sna->render_state.gen4.drawrect_limit == limit &&
948	    sna->render_state.gen4.drawrect_offset == offset)
949		return true;
950
951	sna->render_state.gen4.drawrect_offset = offset;
952	sna->render_state.gen4.drawrect_limit = limit;
953
954	OUT_BATCH(GEN4_3DSTATE_DRAWING_RECTANGLE | (4 - 2));
955	OUT_BATCH(0);
956	OUT_BATCH(limit);
957	OUT_BATCH(offset);
958	return false;
959}
960
961static void
962gen4_emit_vertex_elements(struct sna *sna,
963			  const struct sna_composite_op *op)
964{
965	/*
966	 * vertex data in vertex buffer
967	 *    position: (x, y)
968	 *    texture coordinate 0: (u0, v0) if (is_affine is true) else (u0, v0, w0)
969	 *    texture coordinate 1 if (has_mask is true): same as above
970	 */
971	struct gen4_render_state *render = &sna->render_state.gen4;
972	uint32_t src_format, dw;
973	int id = op->u.gen4.ve_id;
974
975	if (render->ve_id == id)
976		return;
977	render->ve_id = id;
978
979	/* The VUE layout
980	 *    dword 0-3: position (x, y, 1.0, 1.0),
981	 *    dword 4-7: texture coordinate 0 (u0, v0, w0, 1.0)
982	 *    [optional] dword 8-11: texture coordinate 1 (u1, v1, w1, 1.0)
983	 */
984	OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * (1 + 2) - 1));
985
986	/* x,y */
987	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
988		  GEN4_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
989		  0 << VE0_OFFSET_SHIFT);
990	OUT_BATCH(VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
991		  VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
992		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
993		  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
994		  (1*4) << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
995
996	/* u0, v0, w0 */
997	/* u0, v0, w0 */
998	DBG(("%s: first channel %d floats, offset=4b\n", __FUNCTION__, id & 3));
999	dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
1000	switch (id & 3) {
1001	default:
1002		assert(0);
1003	case 0:
1004		src_format = GEN4_SURFACEFORMAT_R16G16_SSCALED;
1005		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
1006		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
1007		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
1008		break;
1009	case 1:
1010		src_format = GEN4_SURFACEFORMAT_R32_FLOAT;
1011		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
1012		dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
1013		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
1014		break;
1015	case 2:
1016		src_format = GEN4_SURFACEFORMAT_R32G32_FLOAT;
1017		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
1018		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
1019		dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
1020		break;
1021	case 3:
1022		src_format = GEN4_SURFACEFORMAT_R32G32B32_FLOAT;
1023		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
1024		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
1025		dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
1026		break;
1027	}
1028	OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
1029		  src_format << VE0_FORMAT_SHIFT |
1030		  4 << VE0_OFFSET_SHIFT);
1031	OUT_BATCH(dw | 8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
1032
1033	/* u1, v1, w1 */
1034	if (id >> 2) {
1035		unsigned src_offset = 4 + ((id & 3) ?: 1) * sizeof(float);
1036		DBG(("%s: second channel %d floats, offset=%db\n", __FUNCTION__,
1037		     id >> 2, src_offset));
1038		dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
1039		switch (id >> 2) {
1040		case 1:
1041			src_format = GEN4_SURFACEFORMAT_R32_FLOAT;
1042			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
1043			dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
1044			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
1045			break;
1046		default:
1047			assert(0);
1048		case 2:
1049			src_format = GEN4_SURFACEFORMAT_R32G32_FLOAT;
1050			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
1051			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
1052			dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
1053			break;
1054		case 3:
1055			src_format = GEN4_SURFACEFORMAT_R32G32B32_FLOAT;
1056			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
1057			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
1058			dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
1059			break;
1060		}
1061		OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
1062			  src_format << VE0_FORMAT_SHIFT |
1063			  src_offset << VE0_OFFSET_SHIFT);
1064		OUT_BATCH(dw | 12 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
1065	} else {
1066		OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
1067			  GEN4_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
1068			  0 << VE0_OFFSET_SHIFT);
1069		OUT_BATCH(VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
1070			  VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
1071			  VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
1072			  VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
1073			  12 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
1074	}
1075}
1076
1077static void
1078gen4_emit_state(struct sna *sna,
1079		const struct sna_composite_op *op,
1080		uint16_t wm_binding_table)
1081{
1082	bool flush;
1083
1084	assert(op->dst.bo->exec);
1085
1086	flush = wm_binding_table & 1;
1087	wm_binding_table &= ~1;
1088
1089	if (ALWAYS_FLUSH || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
1090		DBG(("%s: flushing dirty (%d, %d), forced? %d\n", __FUNCTION__,
1091		     kgem_bo_is_dirty(op->src.bo),
1092		     kgem_bo_is_dirty(op->mask.bo),
1093		     flush));
1094		gen4_emit_pipe_invalidate(sna);
1095		kgem_clear_dirty(&sna->kgem);
1096		kgem_bo_mark_dirty(op->dst.bo);
1097		flush = false;
1098	}
1099	flush &= gen4_emit_drawing_rectangle(sna, op);
1100	if (flush && op->op > PictOpSrc)
1101		gen4_emit_pipe_flush(sna);
1102
1103	gen4_emit_binding_table(sna, wm_binding_table);
1104	gen4_emit_pipelined_pointers(sna, op, op->op, op->u.gen4.wm_kernel);
1105	gen4_emit_vertex_elements(sna, op);
1106}
1107
1108static void
1109gen4_bind_surfaces(struct sna *sna,
1110		   const struct sna_composite_op *op)
1111{
1112	uint32_t *binding_table;
1113	uint16_t offset, dirty;
1114
1115	gen4_get_batch(sna, op);
1116	dirty = kgem_bo_is_dirty(op->dst.bo);
1117
1118	binding_table = gen4_composite_get_binding_table(sna, &offset);
1119
1120	binding_table[0] =
1121		gen4_bind_bo(sna,
1122			    op->dst.bo, op->dst.width, op->dst.height,
1123			    gen4_get_dest_format(op->dst.format),
1124			    true);
1125	binding_table[1] =
1126		gen4_bind_bo(sna,
1127			     op->src.bo, op->src.width, op->src.height,
1128			     op->src.card_format,
1129			     false);
1130	if (op->mask.bo) {
1131		assert(op->u.gen4.ve_id >> 2);
1132		binding_table[2] =
1133			gen4_bind_bo(sna,
1134				     op->mask.bo,
1135				     op->mask.width,
1136				     op->mask.height,
1137				     op->mask.card_format,
1138				     false);
1139	}
1140
1141	if (sna->kgem.surface == offset &&
1142	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen4.surface_table) == *(uint64_t*)binding_table &&
1143	    (op->mask.bo == NULL ||
1144	     sna->kgem.batch[sna->render_state.gen4.surface_table+2] == binding_table[2])) {
1145		sna->kgem.surface += sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
1146		offset = sna->render_state.gen4.surface_table;
1147	}
1148
1149	if (!ALWAYS_FLUSH && sna->kgem.batch[sna->render_state.gen4.surface_table] == binding_table[0])
1150		dirty = 0;
1151
1152	gen4_emit_state(sna, op, offset | dirty);
1153}
1154
1155fastcall static void
1156gen4_render_composite_blt(struct sna *sna,
1157			  const struct sna_composite_op *op,
1158			  const struct sna_composite_rectangles *r)
1159{
1160	DBG(("%s: src=(%d, %d)+(%d, %d), mask=(%d, %d)+(%d, %d), dst=(%d, %d)+(%d, %d), size=(%d, %d)\n",
1161	     __FUNCTION__,
1162	     r->src.x, r->src.y, op->src.offset[0], op->src.offset[1],
1163	     r->mask.x, r->mask.y, op->mask.offset[0], op->mask.offset[1],
1164	     r->dst.x, r->dst.y, op->dst.x, op->dst.y,
1165	     r->width, r->height));
1166
1167	gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
1168	op->prim_emit(sna, op, r);
1169}
1170
1171fastcall static void
1172gen4_render_composite_box(struct sna *sna,
1173			  const struct sna_composite_op *op,
1174			  const BoxRec *box)
1175{
1176	struct sna_composite_rectangles r;
1177
1178	DBG(("  %s: (%d, %d), (%d, %d)\n",
1179	     __FUNCTION__,
1180	     box->x1, box->y1, box->x2, box->y2));
1181
1182	gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
1183
1184	r.dst.x = box->x1;
1185	r.dst.y = box->y1;
1186	r.width  = box->x2 - box->x1;
1187	r.height = box->y2 - box->y1;
1188	r.mask = r.src = r.dst;
1189
1190	op->prim_emit(sna, op, &r);
1191}
1192
1193static void
1194gen4_render_composite_boxes__blt(struct sna *sna,
1195				 const struct sna_composite_op *op,
1196				 const BoxRec *box, int nbox)
1197{
1198	DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
1199	     __FUNCTION__, nbox, op->dst.x, op->dst.y,
1200	     op->src.offset[0], op->src.offset[1],
1201	     op->src.width, op->src.height,
1202	     op->mask.offset[0], op->mask.offset[1],
1203	     op->mask.width, op->mask.height));
1204
1205	do {
1206		int nbox_this_time;
1207
1208		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
1209						     gen4_bind_surfaces);
1210		nbox -= nbox_this_time;
1211
1212		do {
1213			struct sna_composite_rectangles r;
1214
1215			DBG(("  %s: (%d, %d), (%d, %d)\n",
1216			     __FUNCTION__,
1217			     box->x1, box->y1, box->x2, box->y2));
1218
1219			r.dst.x = box->x1;
1220			r.dst.y = box->y1;
1221			r.width  = box->x2 - box->x1;
1222			r.height = box->y2 - box->y1;
1223			r.mask = r.src = r.dst;
1224			op->prim_emit(sna, op, &r);
1225			box++;
1226		} while (--nbox_this_time);
1227	} while (nbox);
1228}
1229
1230static void
1231gen4_render_composite_boxes(struct sna *sna,
1232			    const struct sna_composite_op *op,
1233			    const BoxRec *box, int nbox)
1234{
1235	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
1236
1237	do {
1238		int nbox_this_time;
1239		float *v;
1240
1241		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
1242						     gen4_bind_surfaces);
1243		assert(nbox_this_time);
1244		nbox -= nbox_this_time;
1245
1246		v = sna->render.vertices + sna->render.vertex_used;
1247		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
1248
1249		op->emit_boxes(op, box, nbox_this_time, v);
1250		box += nbox_this_time;
1251	} while (nbox);
1252}
1253
1254#if !FORCE_FLUSH
1255static void
1256gen4_render_composite_boxes__thread(struct sna *sna,
1257				    const struct sna_composite_op *op,
1258				    const BoxRec *box, int nbox)
1259{
1260	DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
1261
1262	sna_vertex_lock(&sna->render);
1263	do {
1264		int nbox_this_time;
1265		float *v;
1266
1267		nbox_this_time = gen4_get_rectangles(sna, op, nbox,
1268						     gen4_bind_surfaces);
1269		assert(nbox_this_time);
1270		nbox -= nbox_this_time;
1271
1272		v = sna->render.vertices + sna->render.vertex_used;
1273		sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
1274
1275		sna_vertex_acquire__locked(&sna->render);
1276		sna_vertex_unlock(&sna->render);
1277
1278		op->emit_boxes(op, box, nbox_this_time, v);
1279		box += nbox_this_time;
1280
1281		sna_vertex_lock(&sna->render);
1282		sna_vertex_release__locked(&sna->render);
1283	} while (nbox);
1284	sna_vertex_unlock(&sna->render);
1285}
1286#endif
1287
1288#ifndef MAX
1289#define MAX(a,b) ((a) > (b) ? (a) : (b))
1290#endif
1291
1292static uint32_t gen4_bind_video_source(struct sna *sna,
1293				       struct kgem_bo *src_bo,
1294				       uint32_t src_offset,
1295				       int src_width,
1296				       int src_height,
1297				       int src_pitch,
1298				       uint32_t src_surf_format)
1299{
1300	struct gen4_surface_state *ss;
1301
1302	sna->kgem.surface -= sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
1303
1304	ss = memset(sna->kgem.batch + sna->kgem.surface, 0, sizeof(*ss));
1305	ss->ss0.surface_type = GEN4_SURFACE_2D;
1306	ss->ss0.surface_format = src_surf_format;
1307	ss->ss0.color_blend = 1;
1308
1309	ss->ss1.base_addr =
1310		kgem_add_reloc(&sna->kgem,
1311			       sna->kgem.surface + 1,
1312			       src_bo,
1313			       I915_GEM_DOMAIN_SAMPLER << 16,
1314			       src_offset);
1315
1316	ss->ss2.width  = src_width - 1;
1317	ss->ss2.height = src_height - 1;
1318	ss->ss3.pitch  = src_pitch - 1;
1319
1320	return sna->kgem.surface * sizeof(uint32_t);
1321}
1322
1323static void gen4_video_bind_surfaces(struct sna *sna,
1324				     const struct sna_composite_op *op)
1325{
1326	struct sna_video_frame *frame = op->priv;
1327	uint32_t src_surf_format;
1328	uint32_t src_surf_base[6];
1329	int src_width[6];
1330	int src_height[6];
1331	int src_pitch[6];
1332	uint32_t *binding_table;
1333	uint16_t offset, dirty;
1334	int n_src, n;
1335
1336	src_surf_base[0] = 0;
1337	src_surf_base[1] = 0;
1338	src_surf_base[2] = frame->VBufOffset;
1339	src_surf_base[3] = frame->VBufOffset;
1340	src_surf_base[4] = frame->UBufOffset;
1341	src_surf_base[5] = frame->UBufOffset;
1342
1343	if (is_planar_fourcc(frame->id)) {
1344		src_surf_format = GEN4_SURFACEFORMAT_R8_UNORM;
1345		src_width[1]  = src_width[0]  = frame->width;
1346		src_height[1] = src_height[0] = frame->height;
1347		src_pitch[1]  = src_pitch[0]  = frame->pitch[1];
1348		src_width[4]  = src_width[5]  = src_width[2]  = src_width[3] =
1349			frame->width / 2;
1350		src_height[4] = src_height[5] = src_height[2] = src_height[3] =
1351			frame->height / 2;
1352		src_pitch[4]  = src_pitch[5]  = src_pitch[2]  = src_pitch[3] =
1353			frame->pitch[0];
1354		n_src = 6;
1355	} else {
1356		if (frame->id == FOURCC_UYVY)
1357			src_surf_format = GEN4_SURFACEFORMAT_YCRCB_SWAPY;
1358		else
1359			src_surf_format = GEN4_SURFACEFORMAT_YCRCB_NORMAL;
1360
1361		src_width[0]  = frame->width;
1362		src_height[0] = frame->height;
1363		src_pitch[0]  = frame->pitch[0];
1364		n_src = 1;
1365	}
1366
1367	gen4_get_batch(sna, op);
1368	dirty = kgem_bo_is_dirty(op->dst.bo);
1369
1370	binding_table = gen4_composite_get_binding_table(sna, &offset);
1371	binding_table[0] =
1372		gen4_bind_bo(sna,
1373			     op->dst.bo, op->dst.width, op->dst.height,
1374			     gen4_get_dest_format(op->dst.format),
1375			     true);
1376	for (n = 0; n < n_src; n++) {
1377		binding_table[1+n] =
1378			gen4_bind_video_source(sna,
1379					       frame->bo,
1380					       src_surf_base[n],
1381					       src_width[n],
1382					       src_height[n],
1383					       src_pitch[n],
1384					       src_surf_format);
1385	}
1386
1387	if (!ALWAYS_FLUSH && sna->kgem.batch[sna->render_state.gen4.surface_table] == binding_table[0])
1388		dirty = 0;
1389
1390	gen4_emit_state(sna, op, offset | dirty);
1391}
1392
1393static bool
1394gen4_render_video(struct sna *sna,
1395		  struct sna_video *video,
1396		  struct sna_video_frame *frame,
1397		  RegionPtr dstRegion,
1398		  PixmapPtr pixmap)
1399{
1400	struct sna_composite_op tmp;
1401	struct sna_pixmap *priv = sna_pixmap(pixmap);
1402	int dst_width = dstRegion->extents.x2 - dstRegion->extents.x1;
1403	int dst_height = dstRegion->extents.y2 - dstRegion->extents.y1;
1404	int src_width = frame->src.x2 - frame->src.x1;
1405	int src_height = frame->src.y2 - frame->src.y1;
1406	float src_offset_x, src_offset_y;
1407	float src_scale_x, src_scale_y;
1408	int nbox, pix_xoff, pix_yoff;
1409	const BoxRec *box;
1410
1411	DBG(("%s: %dx%d -> %dx%d\n", __FUNCTION__,
1412	     src_width, src_height, dst_width, dst_height));
1413
1414	assert(priv->gpu_bo);
1415	memset(&tmp, 0, sizeof(tmp));
1416
1417	tmp.op = PictOpSrc;
1418	tmp.dst.pixmap = pixmap;
1419	tmp.dst.width  = pixmap->drawable.width;
1420	tmp.dst.height = pixmap->drawable.height;
1421	tmp.dst.format = sna_format_for_depth(pixmap->drawable.depth);
1422	tmp.dst.bo = priv->gpu_bo;
1423
1424	if (src_width == dst_width && src_height == dst_height)
1425		tmp.src.filter = SAMPLER_FILTER_NEAREST;
1426	else
1427		tmp.src.filter = SAMPLER_FILTER_BILINEAR;
1428	tmp.src.repeat = SAMPLER_EXTEND_PAD;
1429	tmp.src.bo = frame->bo;
1430	tmp.mask.bo = NULL;
1431	tmp.u.gen4.wm_kernel =
1432		is_planar_fourcc(frame->id) ? WM_KERNEL_VIDEO_PLANAR : WM_KERNEL_VIDEO_PACKED;
1433	tmp.u.gen4.ve_id = 2;
1434	tmp.is_affine = true;
1435	tmp.floats_per_vertex = 3;
1436	tmp.floats_per_rect = 9;
1437	tmp.priv = frame;
1438
1439	if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
1440		kgem_submit(&sna->kgem);
1441		if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL))
1442			return false;
1443	}
1444
1445	gen4_align_vertex(sna, &tmp);
1446	gen4_video_bind_surfaces(sna, &tmp);
1447
1448	/* Set up the offset for translating from the given region (in screen
1449	 * coordinates) to the backing pixmap.
1450	 */
1451#ifdef COMPOSITE
1452	pix_xoff = -pixmap->screen_x + pixmap->drawable.x;
1453	pix_yoff = -pixmap->screen_y + pixmap->drawable.y;
1454#else
1455	pix_xoff = 0;
1456	pix_yoff = 0;
1457#endif
1458
1459	src_scale_x = (float)src_width / dst_width / frame->width;
1460	src_offset_x = (float)frame->src.x1 / frame->width - dstRegion->extents.x1 * src_scale_x;
1461
1462	src_scale_y = (float)src_height / dst_height / frame->height;
1463	src_offset_y = (float)frame->src.y1 / frame->height - dstRegion->extents.y1 * src_scale_y;
1464
1465	box = region_rects(dstRegion);
1466	nbox = region_num_rects(dstRegion);
1467	do {
1468		int n;
1469
1470		n = gen4_get_rectangles(sna, &tmp, nbox,
1471					gen4_video_bind_surfaces);
1472		assert(n);
1473		nbox -= n;
1474
1475		do {
1476			BoxRec r;
1477
1478			r.x1 = box->x1 + pix_xoff;
1479			r.x2 = box->x2 + pix_xoff;
1480			r.y1 = box->y1 + pix_yoff;
1481			r.y2 = box->y2 + pix_yoff;
1482
1483			OUT_VERTEX(r.x2, r.y2);
1484			OUT_VERTEX_F(box->x2 * src_scale_x + src_offset_x);
1485			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
1486
1487			OUT_VERTEX(r.x1, r.y2);
1488			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
1489			OUT_VERTEX_F(box->y2 * src_scale_y + src_offset_y);
1490
1491			OUT_VERTEX(r.x1, r.y1);
1492			OUT_VERTEX_F(box->x1 * src_scale_x + src_offset_x);
1493			OUT_VERTEX_F(box->y1 * src_scale_y + src_offset_y);
1494
1495			if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
1496				sna_damage_add_box(&priv->gpu_damage, &r);
1497				sna_damage_subtract_box(&priv->cpu_damage, &r);
1498			}
1499			box++;
1500		} while (--n);
1501	} while (nbox);
1502	gen4_vertex_flush(sna);
1503
1504	return true;
1505}
1506
1507static int
1508gen4_composite_picture(struct sna *sna,
1509		       PicturePtr picture,
1510		       struct sna_composite_channel *channel,
1511		       int x, int y,
1512		       int w, int h,
1513		       int dst_x, int dst_y,
1514		       bool precise)
1515{
1516	PixmapPtr pixmap;
1517	uint32_t color;
1518	int16_t dx, dy;
1519
1520	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
1521	     __FUNCTION__, x, y, w, h, dst_x, dst_y));
1522
1523	channel->is_solid = false;
1524	channel->card_format = -1;
1525
1526	if (sna_picture_is_solid(picture, &color))
1527		return gen4_channel_init_solid(sna, channel, color);
1528
1529	if (picture->pDrawable == NULL) {
1530		int ret;
1531
1532		if (picture->pSourcePict->type == SourcePictTypeLinear)
1533			return gen4_channel_init_linear(sna, picture, channel,
1534							x, y,
1535							w, h,
1536							dst_x, dst_y);
1537
1538		DBG(("%s -- fixup, gradient\n", __FUNCTION__));
1539		ret = -1;
1540		if (!precise)
1541			ret = sna_render_picture_approximate_gradient(sna, picture, channel,
1542								      x, y, w, h, dst_x, dst_y);
1543		if (ret == -1)
1544			ret = sna_render_picture_fixup(sna, picture, channel,
1545						       x, y, w, h, dst_x, dst_y);
1546		return ret;
1547	}
1548
1549	if (picture->alphaMap) {
1550		DBG(("%s -- fallback, alphamap\n", __FUNCTION__));
1551		return sna_render_picture_fixup(sna, picture, channel,
1552						x, y, w, h, dst_x, dst_y);
1553	}
1554
1555	if (!gen4_check_repeat(picture)) {
1556		DBG(("%s: unknown repeat mode fixup\n", __FUNCTION__));
1557		return sna_render_picture_fixup(sna, picture, channel,
1558						x, y, w, h, dst_x, dst_y);
1559	}
1560
1561	if (!gen4_check_filter(picture)) {
1562		DBG(("%s: unhandled filter fixup\n", __FUNCTION__));
1563		return sna_render_picture_fixup(sna, picture, channel,
1564						x, y, w, h, dst_x, dst_y);
1565	}
1566
1567	channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
1568	channel->filter = picture->filter;
1569
1570	pixmap = get_drawable_pixmap(picture->pDrawable);
1571	get_drawable_deltas(picture->pDrawable, pixmap, &dx, &dy);
1572
1573	x += dx + picture->pDrawable->x;
1574	y += dy + picture->pDrawable->y;
1575
1576	channel->is_affine = sna_transform_is_affine(picture->transform);
1577	if (sna_transform_is_imprecise_integer_translation(picture->transform, picture->filter, precise, &dx, &dy)) {
1578		DBG(("%s: integer translation (%d, %d), removing\n",
1579		     __FUNCTION__, dx, dy));
1580		x += dx;
1581		y += dy;
1582		channel->transform = NULL;
1583		channel->filter = PictFilterNearest;
1584
1585		if (channel->repeat &&
1586		    (x >= 0 &&
1587		     y >= 0 &&
1588		     x + w < pixmap->drawable.width &&
1589		     y + h < pixmap->drawable.height)) {
1590			struct sna_pixmap *priv = sna_pixmap(pixmap);
1591			if (priv && priv->clear) {
1592				DBG(("%s: converting large pixmap source into solid [%08x]\n", __FUNCTION__, priv->clear_color));
1593				return gen4_channel_init_solid(sna, channel, priv->clear_color);
1594			}
1595		}
1596	} else
1597		channel->transform = picture->transform;
1598
1599	channel->pict_format = picture->format;
1600	channel->card_format = gen4_get_card_format(picture->format);
1601	if (channel->card_format == -1)
1602		return sna_render_picture_convert(sna, picture, channel, pixmap,
1603						  x, y, w, h, dst_x, dst_y,
1604						  false);
1605
1606	if (too_large(pixmap->drawable.width, pixmap->drawable.height))
1607		return sna_render_picture_extract(sna, picture, channel,
1608						  x, y, w, h, dst_x, dst_y);
1609
1610	return sna_render_pixmap_bo(sna, channel, pixmap,
1611				    x, y, w, h, dst_x, dst_y);
1612}
1613
1614static void gen4_composite_channel_convert(struct sna_composite_channel *channel)
1615{
1616	DBG(("%s: repeat %d -> %d, filter %d -> %d\n",
1617	     __FUNCTION__,
1618	     channel->repeat, gen4_repeat(channel->repeat),
1619	     channel->filter, gen4_repeat(channel->filter)));
1620	channel->repeat = gen4_repeat(channel->repeat);
1621	channel->filter = gen4_filter(channel->filter);
1622	if (channel->card_format == (unsigned)-1)
1623		channel->card_format = gen4_get_card_format(channel->pict_format);
1624}
1625
1626static void
1627gen4_render_composite_done(struct sna *sna,
1628			   const struct sna_composite_op *op)
1629{
1630	DBG(("%s()\n", __FUNCTION__));
1631
1632	if (sna->render.vertex_offset) {
1633		gen4_vertex_flush(sna);
1634		gen4_magic_ca_pass(sna, op);
1635	}
1636
1637	if (op->mask.bo)
1638		kgem_bo_destroy(&sna->kgem, op->mask.bo);
1639	if (op->src.bo)
1640		kgem_bo_destroy(&sna->kgem, op->src.bo);
1641
1642	sna_render_composite_redirect_done(sna, op);
1643}
1644
1645static bool
1646gen4_composite_set_target(struct sna *sna,
1647			  struct sna_composite_op *op,
1648			  PicturePtr dst,
1649			  int x, int y, int w, int h,
1650			  bool partial)
1651{
1652	BoxRec box;
1653	unsigned hint;
1654
1655	op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
1656	op->dst.width  = op->dst.pixmap->drawable.width;
1657	op->dst.height = op->dst.pixmap->drawable.height;
1658	op->dst.format = dst->format;
1659	if (w && h) {
1660		box.x1 = x;
1661		box.y1 = y;
1662		box.x2 = x + w;
1663		box.y2 = y + h;
1664	} else
1665		sna_render_picture_extents(dst, &box);
1666
1667	hint = PREFER_GPU | FORCE_GPU | RENDER_GPU;
1668	if (!partial) {
1669		hint |= IGNORE_DAMAGE;
1670		if (w == op->dst.width && h == op->dst.height)
1671			hint |= REPLACES;
1672	}
1673
1674	op->dst.bo = sna_drawable_use_bo(dst->pDrawable, hint, &box, &op->damage);
1675	if (op->dst.bo == NULL)
1676		return false;
1677
1678	if (hint & REPLACES) {
1679		struct sna_pixmap *priv = sna_pixmap(op->dst.pixmap);
1680		kgem_bo_pair_undo(&sna->kgem, priv->gpu_bo, priv->cpu_bo);
1681	}
1682
1683	get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
1684			    &op->dst.x, &op->dst.y);
1685
1686	DBG(("%s: pixmap=%ld, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
1687	     __FUNCTION__,
1688	     op->dst.pixmap->drawable.serialNumber, (int)op->dst.format,
1689	     op->dst.width, op->dst.height,
1690	     op->dst.bo->pitch,
1691	     op->dst.x, op->dst.y,
1692	     op->damage ? *op->damage : (void *)-1));
1693
1694	assert(op->dst.bo->proxy == NULL);
1695
1696	if (too_large(op->dst.width, op->dst.height) &&
1697	    !sna_render_composite_redirect(sna, op, x, y, w, h, partial))
1698		return false;
1699
1700	return true;
1701}
1702
1703static bool
1704check_gradient(PicturePtr picture, bool precise)
1705{
1706	switch (picture->pSourcePict->type) {
1707	case SourcePictTypeSolidFill:
1708	case SourcePictTypeLinear:
1709		return false;
1710	default:
1711		return precise;
1712	}
1713}
1714
1715static bool
1716has_alphamap(PicturePtr p)
1717{
1718	return p->alphaMap != NULL;
1719}
1720
1721static bool
1722need_upload(struct sna *sna, PicturePtr p)
1723{
1724	return p->pDrawable && untransformed(p) &&
1725		!is_gpu(sna, p->pDrawable, PREFER_GPU_RENDER);
1726}
1727
1728static bool
1729source_is_busy(PixmapPtr pixmap)
1730{
1731	struct sna_pixmap *priv = sna_pixmap(pixmap);
1732	if (priv == NULL)
1733		return false;
1734
1735	if (priv->clear)
1736		return false;
1737
1738	if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
1739		return true;
1740
1741	if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
1742		return true;
1743
1744	return priv->gpu_damage && !priv->cpu_damage;
1745}
1746
1747static bool
1748source_fallback(struct sna *sna, PicturePtr p, PixmapPtr pixmap, bool precise)
1749{
1750	if (sna_picture_is_solid(p, NULL))
1751		return false;
1752
1753	if (p->pSourcePict)
1754		return check_gradient(p, precise);
1755
1756	if (!gen4_check_repeat(p) || !gen4_check_format(p->format))
1757		return true;
1758
1759	/* soft errors: perfer to upload/compute rather than readback */
1760	if (pixmap && source_is_busy(pixmap))
1761		return false;
1762
1763	return has_alphamap(p) || !gen4_check_filter(p) || need_upload(sna, p);
1764}
1765
1766static bool
1767gen4_composite_fallback(struct sna *sna,
1768			PicturePtr src,
1769			PicturePtr mask,
1770			PicturePtr dst)
1771{
1772	PixmapPtr src_pixmap;
1773	PixmapPtr mask_pixmap;
1774	PixmapPtr dst_pixmap;
1775	bool src_fallback, mask_fallback;
1776
1777	if (!gen4_check_dst_format(dst->format)) {
1778		DBG(("%s: unknown destination format: %d\n",
1779		     __FUNCTION__, dst->format));
1780		return true;
1781	}
1782
1783	dst_pixmap = get_drawable_pixmap(dst->pDrawable);
1784
1785	src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
1786	src_fallback = source_fallback(sna, src, src_pixmap,
1787				       dst->polyMode == PolyModePrecise);
1788
1789	if (mask) {
1790		mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
1791		mask_fallback = source_fallback(sna, mask, mask_pixmap,
1792						dst->polyMode == PolyModePrecise);
1793	} else {
1794		mask_pixmap = NULL;
1795		mask_fallback = false;
1796	}
1797
1798	/* If we are using the destination as a source and need to
1799	 * readback in order to upload the source, do it all
1800	 * on the cpu.
1801	 */
1802	if (src_pixmap == dst_pixmap && src_fallback) {
1803		DBG(("%s: src is dst and will fallback\n",__FUNCTION__));
1804		return true;
1805	}
1806	if (mask_pixmap == dst_pixmap && mask_fallback) {
1807		DBG(("%s: mask is dst and will fallback\n",__FUNCTION__));
1808		return true;
1809	}
1810
1811	/* If anything is on the GPU, push everything out to the GPU */
1812	if (dst_use_gpu(dst_pixmap)) {
1813		DBG(("%s: dst is already on the GPU, try to use GPU\n",
1814		     __FUNCTION__));
1815		return false;
1816	}
1817
1818	if (src_pixmap && !src_fallback) {
1819		DBG(("%s: src is already on the GPU, try to use GPU\n",
1820		     __FUNCTION__));
1821		return false;
1822	}
1823	if (mask_pixmap && !mask_fallback) {
1824		DBG(("%s: mask is already on the GPU, try to use GPU\n",
1825		     __FUNCTION__));
1826		return false;
1827	}
1828
1829	/* However if the dst is not on the GPU and we need to
1830	 * render one of the sources using the CPU, we may
1831	 * as well do the entire operation in place onthe CPU.
1832	 */
1833	if (src_fallback) {
1834		DBG(("%s: dst is on the CPU and src will fallback\n",
1835		     __FUNCTION__));
1836		return true;
1837	}
1838
1839	if (mask_fallback) {
1840		DBG(("%s: dst is on the CPU and mask will fallback\n",
1841		     __FUNCTION__));
1842		return true;
1843	}
1844
1845	if (too_large(dst_pixmap->drawable.width,
1846		      dst_pixmap->drawable.height) &&
1847	    dst_is_cpu(dst_pixmap)) {
1848		DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
1849		return true;
1850	}
1851
1852	DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
1853	     __FUNCTION__));
1854	return dst_use_cpu(dst_pixmap);
1855}
1856
1857static int
1858reuse_source(struct sna *sna,
1859	     PicturePtr src, struct sna_composite_channel *sc, int src_x, int src_y,
1860	     PicturePtr mask, struct sna_composite_channel *mc, int msk_x, int msk_y)
1861{
1862	uint32_t color;
1863
1864	if (src_x != msk_x || src_y != msk_y)
1865		return false;
1866
1867	if (src == mask) {
1868		DBG(("%s: mask is source\n", __FUNCTION__));
1869		*mc = *sc;
1870		mc->bo = kgem_bo_reference(mc->bo);
1871		return true;
1872	}
1873
1874	if (sna_picture_is_solid(mask, &color))
1875		return gen4_channel_init_solid(sna, mc, color);
1876
1877	if (sc->is_solid)
1878		return false;
1879
1880	if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
1881		return false;
1882
1883	DBG(("%s: mask reuses source drawable\n", __FUNCTION__));
1884
1885	if (!sna_transform_equal(src->transform, mask->transform))
1886		return false;
1887
1888	if (!sna_picture_alphamap_equal(src, mask))
1889		return false;
1890
1891	if (!gen4_check_repeat(mask))
1892		return false;
1893
1894	if (!gen4_check_filter(mask))
1895		return false;
1896
1897	if (!gen4_check_format(mask->format))
1898		return false;
1899
1900	DBG(("%s: reusing source channel for mask with a twist\n",
1901	     __FUNCTION__));
1902
1903	*mc = *sc;
1904	mc->repeat = gen4_repeat(mask->repeat ? mask->repeatType : RepeatNone);
1905	mc->filter = gen4_filter(mask->filter);
1906	mc->pict_format = mask->format;
1907	mc->card_format = gen4_get_card_format(mask->format);
1908	mc->bo = kgem_bo_reference(mc->bo);
1909	return true;
1910}
1911
1912static bool
1913gen4_render_composite(struct sna *sna,
1914		      uint8_t op,
1915		      PicturePtr src,
1916		      PicturePtr mask,
1917		      PicturePtr dst,
1918		      int16_t src_x, int16_t src_y,
1919		      int16_t msk_x, int16_t msk_y,
1920		      int16_t dst_x, int16_t dst_y,
1921		      int16_t width, int16_t height,
1922		      unsigned flags,
1923		      struct sna_composite_op *tmp)
1924{
1925	DBG(("%s: %dx%d, current mode=%d\n", __FUNCTION__,
1926	     width, height, sna->kgem.mode));
1927
1928	if (op >= ARRAY_SIZE(gen4_blend_op))
1929		return false;
1930
1931	if (mask == NULL &&
1932	    sna_blt_composite(sna, op,
1933			      src, dst,
1934			      src_x, src_y,
1935			      dst_x, dst_y,
1936			      width, height,
1937			      flags, tmp))
1938		return true;
1939
1940	if (gen4_composite_fallback(sna, src, mask, dst))
1941		goto fallback;
1942
1943	if (need_tiling(sna, width, height))
1944		return sna_tiling_composite(op, src, mask, dst,
1945					    src_x, src_y,
1946					    msk_x, msk_y,
1947					    dst_x, dst_y,
1948					    width, height,
1949					    tmp);
1950
1951	if (!gen4_composite_set_target(sna, tmp, dst,
1952				       dst_x, dst_y, width, height,
1953				       flags & COMPOSITE_PARTIAL || op > PictOpSrc)) {
1954		DBG(("%s: failed to set composite target\n", __FUNCTION__));
1955		goto fallback;
1956	}
1957
1958	tmp->op = op;
1959	switch (gen4_composite_picture(sna, src, &tmp->src,
1960				       src_x, src_y,
1961				       width, height,
1962				       dst_x, dst_y,
1963				       dst->polyMode == PolyModePrecise)) {
1964	case -1:
1965		DBG(("%s: failed to prepare source\n", __FUNCTION__));
1966		goto cleanup_dst;
1967	case 0:
1968		if (!gen4_channel_init_solid(sna, &tmp->src, 0))
1969			goto cleanup_dst;
1970		/* fall through to fixup */
1971	case 1:
1972		if (mask == NULL &&
1973		    sna_blt_composite__convert(sna,
1974					       dst_x, dst_y, width, height,
1975					       tmp))
1976			return true;
1977
1978		gen4_composite_channel_convert(&tmp->src);
1979		break;
1980	}
1981
1982	tmp->is_affine = tmp->src.is_affine;
1983	tmp->has_component_alpha = false;
1984	tmp->need_magic_ca_pass = false;
1985
1986	if (mask) {
1987		if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
1988			tmp->has_component_alpha = true;
1989
1990			/* Check if it's component alpha that relies on a source alpha and on
1991			 * the source value.  We can only get one of those into the single
1992			 * source value that we get to blend with.
1993			 */
1994			if (gen4_blend_op[op].src_alpha &&
1995			    (gen4_blend_op[op].src_blend != GEN4_BLENDFACTOR_ZERO)) {
1996				if (op != PictOpOver) {
1997					DBG(("%s -- fallback: unhandled component alpha blend\n",
1998					     __FUNCTION__));
1999
2000					goto cleanup_src;
2001				}
2002
2003				tmp->need_magic_ca_pass = true;
2004				tmp->op = PictOpOutReverse;
2005			}
2006		}
2007
2008		if (!reuse_source(sna,
2009				  src, &tmp->src, src_x, src_y,
2010				  mask, &tmp->mask, msk_x, msk_y)) {
2011			switch (gen4_composite_picture(sna, mask, &tmp->mask,
2012						       msk_x, msk_y,
2013						       width, height,
2014						       dst_x, dst_y,
2015						       dst->polyMode == PolyModePrecise)) {
2016			case -1:
2017				DBG(("%s: failed to prepare mask\n", __FUNCTION__));
2018				goto cleanup_src;
2019			case 0:
2020				if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
2021					goto cleanup_src;
2022				/* fall through to fixup */
2023			case 1:
2024				gen4_composite_channel_convert(&tmp->mask);
2025				break;
2026			}
2027		}
2028
2029		tmp->is_affine &= tmp->mask.is_affine;
2030	}
2031
2032	tmp->u.gen4.wm_kernel =
2033		gen4_choose_composite_kernel(tmp->op,
2034					     tmp->mask.bo != NULL,
2035					     tmp->has_component_alpha,
2036					     tmp->is_affine);
2037	tmp->u.gen4.ve_id = gen4_choose_composite_emitter(sna, tmp);
2038
2039	tmp->blt   = gen4_render_composite_blt;
2040	tmp->box   = gen4_render_composite_box;
2041	tmp->boxes = gen4_render_composite_boxes__blt;
2042	if (tmp->emit_boxes) {
2043		tmp->boxes = gen4_render_composite_boxes;
2044#if !FORCE_FLUSH
2045		tmp->thread_boxes = gen4_render_composite_boxes__thread;
2046#endif
2047	}
2048	tmp->done  = gen4_render_composite_done;
2049
2050	if (!kgem_check_bo(&sna->kgem,
2051			   tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2052			   NULL)) {
2053		kgem_submit(&sna->kgem);
2054		if (!kgem_check_bo(&sna->kgem,
2055				     tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
2056				     NULL))
2057			goto cleanup_mask;
2058	}
2059
2060	gen4_align_vertex(sna, tmp);
2061	gen4_bind_surfaces(sna, tmp);
2062	return true;
2063
2064cleanup_mask:
2065	if (tmp->mask.bo) {
2066		kgem_bo_destroy(&sna->kgem, tmp->mask.bo);
2067		tmp->mask.bo = NULL;
2068	}
2069cleanup_src:
2070	if (tmp->src.bo) {
2071		kgem_bo_destroy(&sna->kgem, tmp->src.bo);
2072		tmp->src.bo = NULL;
2073	}
2074cleanup_dst:
2075	if (tmp->redirect.real_bo) {
2076		kgem_bo_destroy(&sna->kgem, tmp->dst.bo);
2077		tmp->redirect.real_bo = NULL;
2078	}
2079fallback:
2080	return (mask == NULL &&
2081		sna_blt_composite(sna, op,
2082				  src, dst,
2083				  src_x, src_y,
2084				  dst_x, dst_y,
2085				  width, height,
2086				  flags | COMPOSITE_FALLBACK, tmp));
2087}
2088
2089#if !NO_COMPOSITE_SPANS
2090fastcall static void
2091gen4_render_composite_spans_box(struct sna *sna,
2092				const struct sna_composite_spans_op *op,
2093				const BoxRec *box, float opacity)
2094{
2095	DBG(("%s: src=+(%d, %d), opacity=%f, dst=+(%d, %d), box=(%d, %d) x (%d, %d)\n",
2096	     __FUNCTION__,
2097	     op->base.src.offset[0], op->base.src.offset[1],
2098	     opacity,
2099	     op->base.dst.x, op->base.dst.y,
2100	     box->x1, box->y1,
2101	     box->x2 - box->x1,
2102	     box->y2 - box->y1));
2103
2104	gen4_get_rectangles(sna, &op->base, 1, gen4_bind_surfaces);
2105	op->prim_emit(sna, op, box, opacity);
2106}
2107
2108static void
2109gen4_render_composite_spans_boxes(struct sna *sna,
2110				  const struct sna_composite_spans_op *op,
2111				  const BoxRec *box, int nbox,
2112				  float opacity)
2113{
2114	DBG(("%s: nbox=%d, src=+(%d, %d), opacity=%f, dst=+(%d, %d)\n",
2115	     __FUNCTION__, nbox,
2116	     op->base.src.offset[0], op->base.src.offset[1],
2117	     opacity,
2118	     op->base.dst.x, op->base.dst.y));
2119
2120	do {
2121		int nbox_this_time;
2122
2123		nbox_this_time = gen4_get_rectangles(sna, &op->base, nbox,
2124						     gen4_bind_surfaces);
2125		nbox -= nbox_this_time;
2126
2127		do {
2128			DBG(("  %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
2129			     box->x1, box->y1,
2130			     box->x2 - box->x1,
2131			     box->y2 - box->y1));
2132
2133			op->prim_emit(sna, op, box++, opacity);
2134		} while (--nbox_this_time);
2135	} while (nbox);
2136}
2137
2138fastcall static void
2139gen4_render_composite_spans_boxes__thread(struct sna *sna,
2140					  const struct sna_composite_spans_op *op,
2141					  const struct sna_opacity_box *box,
2142					  int nbox)
2143{
2144	DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
2145	     __FUNCTION__, nbox,
2146	     op->base.src.offset[0], op->base.src.offset[1],
2147	     op->base.dst.x, op->base.dst.y));
2148	assert(nbox);
2149
2150	sna_vertex_lock(&sna->render);
2151	do {
2152		int nbox_this_time;
2153		float *v;
2154
2155		nbox_this_time = gen4_get_rectangles(sna, &op->base, nbox,
2156						     gen4_bind_surfaces);
2157		assert(nbox_this_time);
2158		nbox -= nbox_this_time;
2159
2160		v = sna->render.vertices + sna->render.vertex_used;
2161		sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
2162
2163		sna_vertex_acquire__locked(&sna->render);
2164		sna_vertex_unlock(&sna->render);
2165
2166		op->emit_boxes(op, box, nbox_this_time, v);
2167		box += nbox_this_time;
2168
2169		sna_vertex_lock(&sna->render);
2170		sna_vertex_release__locked(&sna->render);
2171	} while (nbox);
2172	sna_vertex_unlock(&sna->render);
2173}
2174
2175fastcall static void
2176gen4_render_composite_spans_done(struct sna *sna,
2177				 const struct sna_composite_spans_op *op)
2178{
2179	if (sna->render.vertex_offset)
2180		gen4_vertex_flush(sna);
2181
2182	DBG(("%s()\n", __FUNCTION__));
2183
2184	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2185	sna_render_composite_redirect_done(sna, &op->base);
2186}
2187
2188static bool
2189gen4_check_composite_spans(struct sna *sna,
2190			   uint8_t op, PicturePtr src, PicturePtr dst,
2191			   int16_t width, int16_t height,
2192			   unsigned flags)
2193{
2194	DBG(("%s: op=%d, width=%d, height=%d, flags=%x\n",
2195	     __FUNCTION__, op, width, height, flags));
2196
2197	if (op >= ARRAY_SIZE(gen4_blend_op))
2198		return false;
2199
2200	if (gen4_composite_fallback(sna, src, NULL, dst)) {
2201		DBG(("%s: operation would fallback\n", __FUNCTION__));
2202		return false;
2203	}
2204
2205	if (need_tiling(sna, width, height) &&
2206	    !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
2207		DBG(("%s: fallback, tiled operation not on GPU\n",
2208		     __FUNCTION__));
2209		return false;
2210	}
2211
2212	if (FORCE_SPANS)
2213		return FORCE_SPANS > 0;
2214
2215	if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0) {
2216		struct sna_pixmap *priv;
2217
2218		if (FORCE_NONRECTILINEAR_SPANS)
2219			return FORCE_NONRECTILINEAR_SPANS > 0;
2220
2221		if ((sna->render.prefer_gpu & PREFER_GPU_SPANS) == 0)
2222			return false;
2223
2224		priv = sna_pixmap_from_drawable(dst->pDrawable);
2225		assert(priv);
2226
2227		if (priv->cpu_bo &&
2228		    __kgem_bo_is_busy(&sna->kgem, priv->cpu_bo))
2229			return true;
2230
2231		if (flags & COMPOSITE_SPANS_INPLACE_HINT)
2232			return false;
2233
2234		return priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo);
2235	}
2236
2237	return true;
2238}
2239
2240static bool
2241gen4_render_composite_spans(struct sna *sna,
2242			    uint8_t op,
2243			    PicturePtr src,
2244			    PicturePtr dst,
2245			    int16_t src_x,  int16_t src_y,
2246			    int16_t dst_x,  int16_t dst_y,
2247			    int16_t width,  int16_t height,
2248			    unsigned flags,
2249			    struct sna_composite_spans_op *tmp)
2250{
2251	DBG(("%s: %dx%d with flags=%x, current mode=%d\n", __FUNCTION__,
2252	     width, height, flags, sna->kgem.ring));
2253
2254	assert(gen4_check_composite_spans(sna, op, src, dst, width, height, flags));
2255
2256	if (need_tiling(sna, width, height)) {
2257		DBG(("%s: tiling, operation (%dx%d) too wide for pipeline\n",
2258		     __FUNCTION__, width, height));
2259		return sna_tiling_composite_spans(op, src, dst,
2260						  src_x, src_y, dst_x, dst_y,
2261						  width, height, flags, tmp);
2262	}
2263
2264	tmp->base.op = op;
2265	if (!gen4_composite_set_target(sna, &tmp->base, dst,
2266				       dst_x, dst_y, width, height, true))
2267		return false;
2268
2269	switch (gen4_composite_picture(sna, src, &tmp->base.src,
2270				       src_x, src_y,
2271				       width, height,
2272				       dst_x, dst_y,
2273				       dst->polyMode == PolyModePrecise)) {
2274	case -1:
2275		goto cleanup_dst;
2276	case 0:
2277		if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
2278			goto cleanup_dst;
2279		/* fall through to fixup */
2280	case 1:
2281		gen4_composite_channel_convert(&tmp->base.src);
2282		break;
2283	}
2284
2285	tmp->base.mask.bo = NULL;
2286	tmp->base.mask.filter = SAMPLER_FILTER_NEAREST;
2287	tmp->base.mask.repeat = SAMPLER_EXTEND_NONE;
2288
2289	tmp->base.is_affine = tmp->base.src.is_affine;
2290	tmp->base.has_component_alpha = false;
2291	tmp->base.need_magic_ca_pass = false;
2292
2293	tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(sna, tmp);
2294	tmp->base.u.gen4.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
2295
2296	tmp->box   = gen4_render_composite_spans_box;
2297	tmp->boxes = gen4_render_composite_spans_boxes;
2298	if (tmp->emit_boxes)
2299		tmp->thread_boxes = gen4_render_composite_spans_boxes__thread;
2300	tmp->done  = gen4_render_composite_spans_done;
2301
2302	if (!kgem_check_bo(&sna->kgem,
2303			   tmp->base.dst.bo, tmp->base.src.bo,
2304			   NULL))  {
2305		kgem_submit(&sna->kgem);
2306		if (!kgem_check_bo(&sna->kgem,
2307				   tmp->base.dst.bo, tmp->base.src.bo,
2308				   NULL))
2309			goto cleanup_src;
2310	}
2311
2312	gen4_align_vertex(sna, &tmp->base);
2313	gen4_bind_surfaces(sna, &tmp->base);
2314	return true;
2315
2316cleanup_src:
2317	if (tmp->base.src.bo)
2318		kgem_bo_destroy(&sna->kgem, tmp->base.src.bo);
2319cleanup_dst:
2320	if (tmp->base.redirect.real_bo)
2321		kgem_bo_destroy(&sna->kgem, tmp->base.dst.bo);
2322	return false;
2323}
2324#endif
2325
2326static void
2327gen4_copy_bind_surfaces(struct sna *sna, const struct sna_composite_op *op)
2328{
2329	uint32_t *binding_table;
2330	uint16_t offset, dirty;
2331
2332	gen4_get_batch(sna, op);
2333	dirty = kgem_bo_is_dirty(op->dst.bo);
2334
2335	binding_table = gen4_composite_get_binding_table(sna, &offset);
2336
2337	binding_table[0] =
2338		gen4_bind_bo(sna,
2339			     op->dst.bo, op->dst.width, op->dst.height,
2340			     gen4_get_dest_format(op->dst.format),
2341			     true);
2342	binding_table[1] =
2343		gen4_bind_bo(sna,
2344			     op->src.bo, op->src.width, op->src.height,
2345			     op->src.card_format,
2346			     false);
2347
2348	if (sna->kgem.surface == offset &&
2349	    *(uint64_t *)(sna->kgem.batch + sna->render_state.gen4.surface_table) == *(uint64_t*)binding_table) {
2350		sna->kgem.surface += sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
2351		offset = sna->render_state.gen4.surface_table;
2352	}
2353
2354	if (!ALWAYS_FLUSH && sna->kgem.batch[sna->render_state.gen4.surface_table] == binding_table[0])
2355		dirty = 0;
2356
2357	gen4_emit_state(sna, op, offset | dirty);
2358}
2359
2360static void
2361gen4_render_copy_one(struct sna *sna,
2362		     const struct sna_composite_op *op,
2363		     int sx, int sy,
2364		     int w, int h,
2365		     int dx, int dy)
2366{
2367	gen4_get_rectangles(sna, op, 1, gen4_copy_bind_surfaces);
2368
2369	OUT_VERTEX(dx+w, dy+h);
2370	OUT_VERTEX_F((sx+w)*op->src.scale[0]);
2371	OUT_VERTEX_F((sy+h)*op->src.scale[1]);
2372
2373	OUT_VERTEX(dx, dy+h);
2374	OUT_VERTEX_F(sx*op->src.scale[0]);
2375	OUT_VERTEX_F((sy+h)*op->src.scale[1]);
2376
2377	OUT_VERTEX(dx, dy);
2378	OUT_VERTEX_F(sx*op->src.scale[0]);
2379	OUT_VERTEX_F(sy*op->src.scale[1]);
2380}
2381
2382static bool
2383gen4_render_copy_boxes(struct sna *sna, uint8_t alu,
2384		       const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
2385		       const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
2386		       const BoxRec *box, int n, unsigned flags)
2387{
2388	struct sna_composite_op tmp;
2389
2390	DBG(("%s x %d\n", __FUNCTION__, n));
2391
2392	if (sna_blt_compare_depth(src, dst) &&
2393	    sna_blt_copy_boxes(sna, alu,
2394			       src_bo, src_dx, src_dy,
2395			       dst_bo, dst_dx, dst_dy,
2396			       dst->bitsPerPixel,
2397			       box, n))
2398		return true;
2399
2400	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo) {
2401fallback_blt:
2402		if (!sna_blt_compare_depth(src, dst))
2403			return false;
2404
2405		return sna_blt_copy_boxes_fallback(sna, alu,
2406						   src, src_bo, src_dx, src_dy,
2407						   dst, dst_bo, dst_dx, dst_dy,
2408						   box, n);
2409	}
2410
2411	memset(&tmp, 0, sizeof(tmp));
2412
2413	DBG(("%s (%d, %d)->(%d, %d) x %d\n",
2414	     __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n));
2415
2416	if (dst->depth == src->depth) {
2417		tmp.dst.format = sna_render_format_for_depth(dst->depth);
2418		tmp.src.pict_format = tmp.dst.format;
2419	} else {
2420		tmp.dst.format = sna_format_for_depth(dst->depth);
2421		tmp.src.pict_format = sna_format_for_depth(src->depth);
2422	}
2423	if (!gen4_check_format(tmp.src.pict_format))
2424		goto fallback_blt;
2425
2426	tmp.op = alu == GXcopy ? PictOpSrc : PictOpClear;
2427
2428	tmp.dst.pixmap = (PixmapPtr)dst;
2429	tmp.dst.width  = dst->width;
2430	tmp.dst.height = dst->height;
2431	tmp.dst.x = tmp.dst.y = 0;
2432	tmp.dst.bo = dst_bo;
2433	tmp.damage = NULL;
2434
2435	sna_render_composite_redirect_init(&tmp);
2436	if (too_large(tmp.dst.width, tmp.dst.height)) {
2437		BoxRec extents = box[0];
2438		int i;
2439
2440		for (i = 1; i < n; i++) {
2441			if (box[i].x1 < extents.x1)
2442				extents.x1 = box[i].x1;
2443			if (box[i].y1 < extents.y1)
2444				extents.y1 = box[i].y1;
2445
2446			if (box[i].x2 > extents.x2)
2447				extents.x2 = box[i].x2;
2448			if (box[i].y2 > extents.y2)
2449				extents.y2 = box[i].y2;
2450		}
2451		if (!sna_render_composite_redirect(sna, &tmp,
2452						   extents.x1 + dst_dx,
2453						   extents.y1 + dst_dy,
2454						   extents.x2 - extents.x1,
2455						   extents.y2 - extents.y1,
2456						   n > 1))
2457			goto fallback_tiled;
2458	}
2459
2460	tmp.src.filter = SAMPLER_FILTER_NEAREST;
2461	tmp.src.repeat = SAMPLER_EXTEND_NONE;
2462	tmp.src.card_format = gen4_get_card_format(tmp.src.pict_format);
2463	if (too_large(src->width, src->height)) {
2464		BoxRec extents = box[0];
2465		int i;
2466
2467		for (i = 1; i < n; i++) {
2468			if (box[i].x1 < extents.x1)
2469				extents.x1 = box[i].x1;
2470			if (box[i].y1 < extents.y1)
2471				extents.y1 = box[i].y1;
2472
2473			if (box[i].x2 > extents.x2)
2474				extents.x2 = box[i].x2;
2475			if (box[i].y2 > extents.y2)
2476				extents.y2 = box[i].y2;
2477		}
2478
2479		if (!sna_render_pixmap_partial(sna, src, src_bo, &tmp.src,
2480					       extents.x1 + src_dx,
2481					       extents.y1 + src_dy,
2482					       extents.x2 - extents.x1,
2483					       extents.y2 - extents.y1))
2484			goto fallback_tiled_dst;
2485	} else {
2486		tmp.src.bo = kgem_bo_reference(src_bo);
2487		tmp.src.width  = src->width;
2488		tmp.src.height = src->height;
2489		tmp.src.offset[0] = tmp.src.offset[1] = 0;
2490		tmp.src.scale[0] = 1.f/src->width;
2491		tmp.src.scale[1] = 1.f/src->height;
2492	}
2493
2494	tmp.is_affine = true;
2495	tmp.floats_per_vertex = 3;
2496	tmp.floats_per_rect = 9;
2497	tmp.u.gen4.wm_kernel = WM_KERNEL;
2498	tmp.u.gen4.ve_id = 2;
2499
2500	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2501		kgem_submit(&sna->kgem);
2502		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2503			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2504			if (tmp.redirect.real_bo)
2505				kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2506
2507			goto fallback_blt;
2508		}
2509	}
2510
2511	dst_dx += tmp.dst.x;
2512	dst_dy += tmp.dst.y;
2513	tmp.dst.x = tmp.dst.y = 0;
2514
2515	src_dx += tmp.src.offset[0];
2516	src_dy += tmp.src.offset[1];
2517
2518	gen4_align_vertex(sna, &tmp);
2519	gen4_copy_bind_surfaces(sna, &tmp);
2520
2521	do {
2522		gen4_render_copy_one(sna, &tmp,
2523				     box->x1 + src_dx, box->y1 + src_dy,
2524				     box->x2 - box->x1, box->y2 - box->y1,
2525				     box->x1 + dst_dx, box->y1 + dst_dy);
2526		box++;
2527	} while (--n);
2528
2529	gen4_vertex_flush(sna);
2530	sna_render_composite_redirect_done(sna, &tmp);
2531	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2532	return true;
2533
2534fallback_tiled_dst:
2535	if (tmp.redirect.real_bo)
2536		kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
2537fallback_tiled:
2538	if (sna_blt_compare_depth(src, dst) &&
2539	    sna_blt_copy_boxes(sna, alu,
2540			       src_bo, src_dx, src_dy,
2541			       dst_bo, dst_dx, dst_dy,
2542			       dst->bitsPerPixel,
2543			       box, n))
2544		return true;
2545
2546	return sna_tiling_copy_boxes(sna, alu,
2547				     src, src_bo, src_dx, src_dy,
2548				     dst, dst_bo, dst_dx, dst_dy,
2549				     box, n);
2550}
2551
2552static void
2553gen4_render_copy_blt(struct sna *sna,
2554		     const struct sna_copy_op *op,
2555		     int16_t sx, int16_t sy,
2556		     int16_t w,  int16_t h,
2557		     int16_t dx, int16_t dy)
2558{
2559	gen4_render_copy_one(sna, &op->base, sx, sy, w, h, dx, dy);
2560}
2561
2562static void
2563gen4_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
2564{
2565	if (sna->render.vertex_offset)
2566		gen4_vertex_flush(sna);
2567}
2568
2569static bool
2570gen4_render_copy(struct sna *sna, uint8_t alu,
2571		 PixmapPtr src, struct kgem_bo *src_bo,
2572		 PixmapPtr dst, struct kgem_bo *dst_bo,
2573		 struct sna_copy_op *op)
2574{
2575	DBG(("%s: src=%ld, dst=%ld, alu=%d\n",
2576	     __FUNCTION__,
2577	     src->drawable.serialNumber,
2578	     dst->drawable.serialNumber,
2579	     alu));
2580
2581	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2582	    sna_blt_copy(sna, alu,
2583			 src_bo, dst_bo,
2584			 dst->drawable.bitsPerPixel,
2585			 op))
2586		return true;
2587
2588	if (!(alu == GXcopy || alu == GXclear) || src_bo == dst_bo ||
2589	    too_large(src->drawable.width, src->drawable.height) ||
2590	    too_large(dst->drawable.width, dst->drawable.height)) {
2591fallback:
2592		if (!sna_blt_compare_depth(&src->drawable, &dst->drawable))
2593			return false;
2594
2595		return sna_blt_copy(sna, alu, src_bo, dst_bo,
2596				    dst->drawable.bitsPerPixel,
2597				    op);
2598	}
2599
2600	if (dst->drawable.depth == src->drawable.depth) {
2601		op->base.dst.format = sna_render_format_for_depth(dst->drawable.depth);
2602		op->base.src.pict_format = op->base.dst.format;
2603	} else {
2604		op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
2605		op->base.src.pict_format = sna_format_for_depth(src->drawable.depth);
2606	}
2607	if (!gen4_check_format(op->base.src.pict_format))
2608		goto fallback;
2609
2610	op->base.op = alu == GXcopy ? PictOpSrc : PictOpClear;
2611
2612	op->base.dst.pixmap = dst;
2613	op->base.dst.width  = dst->drawable.width;
2614	op->base.dst.height = dst->drawable.height;
2615	op->base.dst.bo = dst_bo;
2616
2617	op->base.src.bo = src_bo;
2618	op->base.src.card_format =
2619		gen4_get_card_format(op->base.src.pict_format);
2620	op->base.src.width  = src->drawable.width;
2621	op->base.src.height = src->drawable.height;
2622	op->base.src.scale[0] = 1.f/src->drawable.width;
2623	op->base.src.scale[1] = 1.f/src->drawable.height;
2624	op->base.src.filter = SAMPLER_FILTER_NEAREST;
2625	op->base.src.repeat = SAMPLER_EXTEND_NONE;
2626
2627	op->base.is_affine = true;
2628	op->base.floats_per_vertex = 3;
2629	op->base.floats_per_rect = 9;
2630	op->base.u.gen4.wm_kernel = WM_KERNEL;
2631	op->base.u.gen4.ve_id = 2;
2632
2633	if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
2634		kgem_submit(&sna->kgem);
2635		if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
2636			goto fallback;
2637	}
2638
2639	if (kgem_bo_is_dirty(src_bo)) {
2640		if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
2641		    sna_blt_copy(sna, alu,
2642				 src_bo, dst_bo,
2643				 dst->drawable.bitsPerPixel,
2644				 op))
2645			return true;
2646	}
2647
2648	gen4_align_vertex(sna, &op->base);
2649	gen4_copy_bind_surfaces(sna, &op->base);
2650
2651	op->blt  = gen4_render_copy_blt;
2652	op->done = gen4_render_copy_done;
2653	return true;
2654}
2655
2656static void
2657gen4_render_fill_rectangle(struct sna *sna,
2658			   const struct sna_composite_op *op,
2659			   int x, int y, int w, int h)
2660{
2661	gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
2662
2663	OUT_VERTEX(x+w, y+h);
2664	OUT_VERTEX_F(.5);
2665
2666	OUT_VERTEX(x, y+h);
2667	OUT_VERTEX_F(.5);
2668
2669	OUT_VERTEX(x, y);
2670	OUT_VERTEX_F(.5);
2671}
2672
2673static bool
2674gen4_render_fill_boxes(struct sna *sna,
2675		       CARD8 op,
2676		       PictFormat format,
2677		       const xRenderColor *color,
2678		       const DrawableRec *dst, struct kgem_bo *dst_bo,
2679		       const BoxRec *box, int n)
2680{
2681	struct sna_composite_op tmp;
2682	uint32_t pixel;
2683
2684	if (op >= ARRAY_SIZE(gen4_blend_op)) {
2685		DBG(("%s: fallback due to unhandled blend op: %d\n",
2686		     __FUNCTION__, op));
2687		return false;
2688	}
2689
2690	if (op <= PictOpSrc) {
2691		uint8_t alu = GXinvalid;
2692
2693		pixel = 0;
2694		if (op == PictOpClear)
2695			alu = GXclear;
2696		else if (sna_get_pixel_from_rgba(&pixel,
2697						 color->red,
2698						 color->green,
2699						 color->blue,
2700						 color->alpha,
2701						 format))
2702			alu = GXcopy;
2703
2704		if (alu != GXinvalid &&
2705		    sna_blt_fill_boxes(sna, alu,
2706				       dst_bo, dst->bitsPerPixel,
2707				       pixel, box, n))
2708			return true;
2709
2710		if (!gen4_check_dst_format(format))
2711			return false;
2712
2713		if (too_large(dst->width, dst->height))
2714			return sna_tiling_fill_boxes(sna, op, format, color,
2715						     dst, dst_bo, box, n);
2716	}
2717
2718	if (op == PictOpClear) {
2719		pixel = 0;
2720		op = PictOpSrc;
2721	} else if (!sna_get_pixel_from_rgba(&pixel,
2722					    color->red,
2723					    color->green,
2724					    color->blue,
2725					    color->alpha,
2726					    PICT_a8r8g8b8))
2727		return false;
2728
2729	DBG(("%s(%08x x %d)\n", __FUNCTION__, pixel, n));
2730
2731	memset(&tmp, 0, sizeof(tmp));
2732
2733	tmp.op = op;
2734
2735	tmp.dst.pixmap = (PixmapPtr)dst;
2736	tmp.dst.width  = dst->width;
2737	tmp.dst.height = dst->height;
2738	tmp.dst.format = format;
2739	tmp.dst.bo = dst_bo;
2740
2741	gen4_channel_init_solid(sna, &tmp.src, pixel);
2742
2743	tmp.is_affine = true;
2744	tmp.floats_per_vertex = 2;
2745	tmp.floats_per_rect = 6;
2746	tmp.u.gen4.wm_kernel = WM_KERNEL;
2747	tmp.u.gen4.ve_id = 1;
2748
2749	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2750		kgem_submit(&sna->kgem);
2751		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL))
2752			return false;
2753	}
2754
2755	gen4_align_vertex(sna, &tmp);
2756	gen4_bind_surfaces(sna, &tmp);
2757
2758	do {
2759		gen4_render_fill_rectangle(sna, &tmp,
2760					   box->x1, box->y1,
2761					   box->x2 - box->x1,
2762					   box->y2 - box->y1);
2763		box++;
2764	} while (--n);
2765
2766	gen4_vertex_flush(sna);
2767	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2768	return true;
2769}
2770
2771static void
2772gen4_render_fill_op_blt(struct sna *sna, const struct sna_fill_op *op,
2773			int16_t x, int16_t y, int16_t w, int16_t h)
2774{
2775	gen4_render_fill_rectangle(sna, &op->base, x, y, w, h);
2776}
2777
2778fastcall static void
2779gen4_render_fill_op_box(struct sna *sna,
2780			const struct sna_fill_op *op,
2781			const BoxRec *box)
2782{
2783	gen4_render_fill_rectangle(sna, &op->base,
2784				   box->x1, box->y1,
2785				   box->x2-box->x1, box->y2-box->y1);
2786}
2787
2788fastcall static void
2789gen4_render_fill_op_boxes(struct sna *sna,
2790			  const struct sna_fill_op *op,
2791			  const BoxRec *box,
2792			  int nbox)
2793{
2794	do {
2795		gen4_render_fill_rectangle(sna, &op->base,
2796					   box->x1, box->y1,
2797					   box->x2-box->x1, box->y2-box->y1);
2798		box++;
2799	} while (--nbox);
2800}
2801
2802static void
2803gen4_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
2804{
2805	if (sna->render.vertex_offset)
2806		gen4_vertex_flush(sna);
2807	kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2808}
2809
2810static bool
2811gen4_render_fill(struct sna *sna, uint8_t alu,
2812		 PixmapPtr dst, struct kgem_bo *dst_bo,
2813		 uint32_t color, unsigned flags,
2814		 struct sna_fill_op *op)
2815{
2816	if (sna_blt_fill(sna, alu,
2817			 dst_bo, dst->drawable.bitsPerPixel,
2818			 color,
2819			 op))
2820		return true;
2821
2822	if (!(alu == GXcopy || alu == GXclear) ||
2823	    too_large(dst->drawable.width, dst->drawable.height))
2824		return sna_blt_fill(sna, alu,
2825				    dst_bo, dst->drawable.bitsPerPixel,
2826				    color,
2827				    op);
2828
2829	if (alu == GXclear)
2830		color = 0;
2831
2832	op->base.op = color == 0 ? PictOpClear : PictOpSrc;
2833
2834	op->base.dst.pixmap = dst;
2835	op->base.dst.width  = dst->drawable.width;
2836	op->base.dst.height = dst->drawable.height;
2837	op->base.dst.format = sna_format_for_depth(dst->drawable.depth);
2838	op->base.dst.bo = dst_bo;
2839	op->base.dst.x = op->base.dst.y = 0;
2840
2841	op->base.need_magic_ca_pass = 0;
2842	op->base.has_component_alpha = 0;
2843
2844	gen4_channel_init_solid(sna, &op->base.src,
2845				sna_rgba_for_color(color,
2846						   dst->drawable.depth));
2847	op->base.mask.bo = NULL;
2848
2849	op->base.is_affine = true;
2850	op->base.floats_per_vertex = 2;
2851	op->base.floats_per_rect = 6;
2852	op->base.u.gen4.wm_kernel = WM_KERNEL;
2853	op->base.u.gen4.ve_id = 1;
2854
2855	if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2856		kgem_submit(&sna->kgem);
2857		if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
2858			kgem_bo_destroy(&sna->kgem, op->base.src.bo);
2859			return false;
2860		}
2861	}
2862
2863	gen4_align_vertex(sna, &op->base);
2864	gen4_bind_surfaces(sna, &op->base);
2865
2866	op->blt   = gen4_render_fill_op_blt;
2867	op->box   = gen4_render_fill_op_box;
2868	op->boxes = gen4_render_fill_op_boxes;
2869	op->points = NULL;
2870	op->done  = gen4_render_fill_op_done;
2871	return true;
2872}
2873
2874static bool
2875gen4_render_fill_one_try_blt(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
2876			     uint32_t color,
2877			     int16_t x1, int16_t y1, int16_t x2, int16_t y2,
2878			     uint8_t alu)
2879{
2880	BoxRec box;
2881
2882	box.x1 = x1;
2883	box.y1 = y1;
2884	box.x2 = x2;
2885	box.y2 = y2;
2886
2887	return sna_blt_fill_boxes(sna, alu,
2888				  bo, dst->drawable.bitsPerPixel,
2889				  color, &box, 1);
2890}
2891
2892static bool
2893gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
2894		     uint32_t color,
2895		     int16_t x1, int16_t y1,
2896		     int16_t x2, int16_t y2,
2897		     uint8_t alu)
2898{
2899	struct sna_composite_op tmp;
2900
2901	DBG(("%s: color=%08x\n", __FUNCTION__, color));
2902
2903	if (gen4_render_fill_one_try_blt(sna, dst, bo, color,
2904					 x1, y1, x2, y2, alu))
2905		return true;
2906
2907	/* Must use the BLT if we can't RENDER... */
2908	if (!(alu == GXcopy || alu == GXclear) ||
2909	    too_large(dst->drawable.width, dst->drawable.height))
2910		return false;
2911
2912	if (alu == GXclear)
2913		color = 0;
2914
2915	tmp.op = color == 0 ? PictOpClear : PictOpSrc;
2916
2917	tmp.dst.pixmap = dst;
2918	tmp.dst.width  = dst->drawable.width;
2919	tmp.dst.height = dst->drawable.height;
2920	tmp.dst.format = sna_format_for_depth(dst->drawable.depth);
2921	tmp.dst.bo = bo;
2922	tmp.dst.x = tmp.dst.y = 0;
2923
2924	gen4_channel_init_solid(sna, &tmp.src,
2925				sna_rgba_for_color(color,
2926						   dst->drawable.depth));
2927	tmp.mask.bo = NULL;
2928	tmp.mask.filter = SAMPLER_FILTER_NEAREST;
2929	tmp.mask.repeat = SAMPLER_EXTEND_NONE;
2930
2931	tmp.is_affine = true;
2932	tmp.floats_per_vertex = 2;
2933	tmp.floats_per_rect = 6;
2934	tmp.has_component_alpha = false;
2935	tmp.need_magic_ca_pass = false;
2936
2937	tmp.u.gen4.wm_kernel = WM_KERNEL;
2938	tmp.u.gen4.ve_id = 1;
2939
2940	if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
2941		kgem_submit(&sna->kgem);
2942		if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
2943			kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2944			return false;
2945		}
2946	}
2947
2948	gen4_align_vertex(sna, &tmp);
2949	gen4_bind_surfaces(sna, &tmp);
2950
2951	gen4_render_fill_rectangle(sna, &tmp, x1, y1, x2 - x1, y2 - y1);
2952
2953	gen4_vertex_flush(sna);
2954	kgem_bo_destroy(&sna->kgem, tmp.src.bo);
2955
2956	return true;
2957}
2958
2959static void gen4_render_reset(struct sna *sna)
2960{
2961	sna->render_state.gen4.needs_invariant = true;
2962	sna->render_state.gen4.needs_urb = true;
2963	sna->render_state.gen4.ve_id = -1;
2964	sna->render_state.gen4.last_primitive = -1;
2965	sna->render_state.gen4.last_pipelined_pointers = -1;
2966
2967	sna->render_state.gen4.drawrect_offset = -1;
2968	sna->render_state.gen4.drawrect_limit = -1;
2969	sna->render_state.gen4.surface_table = 0;
2970
2971	if (sna->render.vbo && !kgem_bo_can_map(&sna->kgem, sna->render.vbo)) {
2972		DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
2973		discard_vbo(sna);
2974	}
2975
2976	sna->render.vertex_offset = 0;
2977	sna->render.nvertex_reloc = 0;
2978	sna->render.vb_id = 0;
2979}
2980
2981static void gen4_render_fini(struct sna *sna)
2982{
2983	kgem_bo_destroy(&sna->kgem, sna->render_state.gen4.general_bo);
2984}
2985
2986static uint32_t gen4_create_vs_unit_state(struct sna_static_stream *stream)
2987{
2988	struct gen4_vs_unit_state *vs = sna_static_stream_map(stream, sizeof(*vs), 32);
2989
2990	/* Set up the vertex shader to be disabled (passthrough) */
2991	vs->thread4.nr_urb_entries = URB_VS_ENTRIES;
2992	vs->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
2993	vs->vs6.vs_enable = 0;
2994	vs->vs6.vert_cache_disable = 1;
2995
2996	return sna_static_stream_offsetof(stream, vs);
2997}
2998
2999static uint32_t gen4_create_sf_state(struct sna_static_stream *stream,
3000				     uint32_t kernel)
3001{
3002	struct gen4_sf_unit_state *sf;
3003
3004	sf = sna_static_stream_map(stream, sizeof(*sf), 32);
3005
3006	sf->thread0.grf_reg_count = GEN4_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
3007	sf->thread0.kernel_start_pointer = kernel >> 6;
3008	sf->thread3.const_urb_entry_read_length = 0;	/* no const URBs */
3009	sf->thread3.const_urb_entry_read_offset = 0;	/* no const URBs */
3010	sf->thread3.urb_entry_read_length = 1;	/* 1 URB per vertex */
3011	/* don't smash vertex header, read start from dw8 */
3012	sf->thread3.urb_entry_read_offset = 1;
3013	sf->thread3.dispatch_grf_start_reg = 3;
3014	sf->thread4.max_threads = GEN4_MAX_SF_THREADS - 1;
3015	sf->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
3016	sf->thread4.nr_urb_entries = URB_SF_ENTRIES;
3017	sf->sf5.viewport_transform = false;	/* skip viewport */
3018	sf->sf6.cull_mode = GEN4_CULLMODE_NONE;
3019	sf->sf6.scissor = 0;
3020	sf->sf7.trifan_pv = 2;
3021	sf->sf6.dest_org_vbias = 0x8;
3022	sf->sf6.dest_org_hbias = 0x8;
3023
3024	return sna_static_stream_offsetof(stream, sf);
3025}
3026
3027static uint32_t gen4_create_sampler_state(struct sna_static_stream *stream,
3028					  sampler_filter_t src_filter,
3029					  sampler_extend_t src_extend,
3030					  sampler_filter_t mask_filter,
3031					  sampler_extend_t mask_extend)
3032{
3033	struct gen4_sampler_state *sampler_state;
3034
3035	sampler_state = sna_static_stream_map(stream,
3036					      sizeof(struct gen4_sampler_state) * 2,
3037					      32);
3038	sampler_state_init(&sampler_state[0], src_filter, src_extend);
3039	sampler_state_init(&sampler_state[1], mask_filter, mask_extend);
3040
3041	return sna_static_stream_offsetof(stream, sampler_state);
3042}
3043
3044static void gen4_init_wm_state(struct gen4_wm_unit_state *wm,
3045			       int gen,
3046			       bool has_mask,
3047			       uint32_t kernel,
3048			       uint32_t sampler)
3049{
3050	assert((kernel & 63) == 0);
3051	wm->thread0.kernel_start_pointer = kernel >> 6;
3052	wm->thread0.grf_reg_count = GEN4_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
3053
3054	wm->thread1.single_program_flow = 0;
3055
3056	wm->thread3.const_urb_entry_read_length = 0;
3057	wm->thread3.const_urb_entry_read_offset = 0;
3058
3059	wm->thread3.urb_entry_read_offset = 0;
3060	wm->thread3.dispatch_grf_start_reg = 3;
3061
3062	assert((sampler & 31) == 0);
3063	wm->wm4.sampler_state_pointer = sampler >> 5;
3064	wm->wm4.sampler_count = 1;
3065
3066	wm->wm5.max_threads = gen >= 045 ? G4X_MAX_WM_THREADS - 1 : GEN4_MAX_WM_THREADS - 1;
3067	wm->wm5.transposed_urb_read = 0;
3068	wm->wm5.thread_dispatch_enable = 1;
3069	/* just use 16-pixel dispatch (4 subspans), don't need to change kernel
3070	 * start point
3071	 */
3072	wm->wm5.enable_16_pix = 1;
3073	wm->wm5.enable_8_pix = 0;
3074	wm->wm5.early_depth_test = 1;
3075
3076	/* Each pair of attributes (src/mask coords) is two URB entries */
3077	if (has_mask) {
3078		wm->thread1.binding_table_entry_count = 3;
3079		wm->thread3.urb_entry_read_length = 4;
3080	} else {
3081		wm->thread1.binding_table_entry_count = 2;
3082		wm->thread3.urb_entry_read_length = 2;
3083	}
3084}
3085
3086static uint32_t gen4_create_cc_unit_state(struct sna_static_stream *stream)
3087{
3088	uint8_t *ptr, *base;
3089	int i, j;
3090
3091	base = ptr =
3092		sna_static_stream_map(stream,
3093				      GEN4_BLENDFACTOR_COUNT*GEN4_BLENDFACTOR_COUNT*64,
3094				      64);
3095
3096	for (i = 0; i < GEN4_BLENDFACTOR_COUNT; i++) {
3097		for (j = 0; j < GEN4_BLENDFACTOR_COUNT; j++) {
3098			struct gen4_cc_unit_state *state =
3099				(struct gen4_cc_unit_state *)ptr;
3100
3101			state->cc3.blend_enable =
3102				!(j == GEN4_BLENDFACTOR_ZERO && i == GEN4_BLENDFACTOR_ONE);
3103
3104			state->cc5.logicop_func = 0xc;	/* COPY */
3105			state->cc5.ia_blend_function = GEN4_BLENDFUNCTION_ADD;
3106
3107			/* Fill in alpha blend factors same as color, for the future. */
3108			state->cc5.ia_src_blend_factor = i;
3109			state->cc5.ia_dest_blend_factor = j;
3110
3111			state->cc6.blend_function = GEN4_BLENDFUNCTION_ADD;
3112			state->cc6.clamp_post_alpha_blend = 1;
3113			state->cc6.clamp_pre_alpha_blend = 1;
3114			state->cc6.src_blend_factor = i;
3115			state->cc6.dest_blend_factor = j;
3116
3117			ptr += 64;
3118		}
3119	}
3120
3121	return sna_static_stream_offsetof(stream, base);
3122}
3123
3124static bool gen4_render_setup(struct sna *sna)
3125{
3126	struct gen4_render_state *state = &sna->render_state.gen4;
3127	struct sna_static_stream general;
3128	struct gen4_wm_unit_state_padded *wm_state;
3129	uint32_t sf, wm[KERNEL_COUNT];
3130	int i, j, k, l, m;
3131
3132	sna_static_stream_init(&general);
3133
3134	/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
3135	 * dumps, you know it points to zero.
3136	 */
3137	null_create(&general);
3138
3139	sf = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__mask);
3140	for (m = 0; m < KERNEL_COUNT; m++) {
3141		if (wm_kernels[m].size) {
3142			wm[m] = sna_static_stream_add(&general,
3143						      wm_kernels[m].data,
3144						      wm_kernels[m].size,
3145						      64);
3146		} else {
3147			wm[m] = sna_static_stream_compile_wm(sna, &general,
3148							     wm_kernels[m].data,
3149							     16);
3150		}
3151	}
3152
3153	state->vs = gen4_create_vs_unit_state(&general);
3154	state->sf = gen4_create_sf_state(&general, sf);
3155
3156	wm_state = sna_static_stream_map(&general,
3157					  sizeof(*wm_state) * KERNEL_COUNT *
3158					  FILTER_COUNT * EXTEND_COUNT *
3159					  FILTER_COUNT * EXTEND_COUNT,
3160					  64);
3161	state->wm = sna_static_stream_offsetof(&general, wm_state);
3162	for (i = 0; i < FILTER_COUNT; i++) {
3163		for (j = 0; j < EXTEND_COUNT; j++) {
3164			for (k = 0; k < FILTER_COUNT; k++) {
3165				for (l = 0; l < EXTEND_COUNT; l++) {
3166					uint32_t sampler_state;
3167
3168					sampler_state =
3169						gen4_create_sampler_state(&general,
3170									  i, j,
3171									  k, l);
3172
3173					for (m = 0; m < KERNEL_COUNT; m++) {
3174						gen4_init_wm_state(&wm_state->state,
3175								   sna->kgem.gen,
3176								   wm_kernels[m].has_mask,
3177								   wm[m], sampler_state);
3178						wm_state++;
3179					}
3180				}
3181			}
3182		}
3183	}
3184
3185	state->cc = gen4_create_cc_unit_state(&general);
3186
3187	state->general_bo = sna_static_stream_fini(sna, &general);
3188	return state->general_bo != NULL;
3189}
3190
3191const char *gen4_render_init(struct sna *sna, const char *backend)
3192{
3193	if (!gen4_render_setup(sna))
3194		return backend;
3195
3196	sna->kgem.retire = gen4_render_retire;
3197	sna->kgem.expire = gen4_render_expire;
3198
3199#if !NO_COMPOSITE
3200	sna->render.composite = gen4_render_composite;
3201	sna->render.prefer_gpu |= PREFER_GPU_RENDER;
3202#endif
3203#if !NO_COMPOSITE_SPANS
3204	sna->render.check_composite_spans = gen4_check_composite_spans;
3205	sna->render.composite_spans = gen4_render_composite_spans;
3206	if (0)
3207		sna->render.prefer_gpu |= PREFER_GPU_SPANS;
3208#endif
3209
3210#if !NO_VIDEO
3211	sna->render.video = gen4_render_video;
3212#endif
3213
3214#if !NO_COPY_BOXES
3215	sna->render.copy_boxes = gen4_render_copy_boxes;
3216#endif
3217#if !NO_COPY
3218	sna->render.copy = gen4_render_copy;
3219#endif
3220
3221#if !NO_FILL_BOXES
3222	sna->render.fill_boxes = gen4_render_fill_boxes;
3223#endif
3224#if !NO_FILL
3225	sna->render.fill = gen4_render_fill;
3226#endif
3227#if !NO_FILL_ONE
3228	sna->render.fill_one = gen4_render_fill_one;
3229#endif
3230
3231	sna->render.flush = gen4_render_flush;
3232	sna->render.reset = gen4_render_reset;
3233	sna->render.fini = gen4_render_fini;
3234
3235	sna->render.max_3d_size = GEN4_MAX_3D_SIZE;
3236	sna->render.max_3d_pitch = 1 << 18;
3237	return sna->kgem.gen >= 045 ? "Eaglelake (gen4.5)" : "Broadwater (gen4)";
3238}
3239