r6xx_accel.c revision c73da4db
1/*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors: Alex Deucher <alexander.deucher@amd.com>
24 *          Matthias Hopf <mhopf@suse.de>
25 */
26#ifdef HAVE_CONFIG_H
27#include "config.h"
28#endif
29
30#include "xf86.h"
31
32#include <errno.h>
33
34#include "radeon.h"
35#include "r600_shader.h"
36#include "radeon_reg.h"
37#include "r600_reg.h"
38#include "r600_state.h"
39
40#include "radeon_drm.h"
41#include "radeon_vbo.h"
42#include "radeon_exa_shared.h"
43
44static const uint32_t R600_ROP[16] = {
45    RADEON_ROP3_ZERO, /* GXclear        */
46    RADEON_ROP3_DSa,  /* GXand          */
47    RADEON_ROP3_SDna, /* GXandReverse   */
48    RADEON_ROP3_S,    /* GXcopy         */
49    RADEON_ROP3_DSna, /* GXandInverted  */
50    RADEON_ROP3_D,    /* GXnoop         */
51    RADEON_ROP3_DSx,  /* GXxor          */
52    RADEON_ROP3_DSo,  /* GXor           */
53    RADEON_ROP3_DSon, /* GXnor          */
54    RADEON_ROP3_DSxn, /* GXequiv        */
55    RADEON_ROP3_Dn,   /* GXinvert       */
56    RADEON_ROP3_SDno, /* GXorReverse    */
57    RADEON_ROP3_Sn,   /* GXcopyInverted */
58    RADEON_ROP3_DSno, /* GXorInverted   */
59    RADEON_ROP3_DSan, /* GXnand         */
60    RADEON_ROP3_ONE,  /* GXset          */
61};
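/* The table above is indexed by the X11 GC raster-op code (GXclear = 0x0
 * through GXset = 0xf) and yields the matching ROP3 code for the CB; see
 * the R600_ROP[cb_conf->rop] use in r600_set_render_target() below.  For
 * instance, GXcopy (0x3) selects RADEON_ROP3_S, i.e. a straight source copy.
 */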
62
63/* we try to batch operations together under KMS -
64   but it doesn't work yet without misrendering */
65#define KMS_MULTI_OP 1
66
67/* Flush the indirect buffer to the kernel for submission to the card */
68void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib)
69{
70    RADEONInfoPtr  info = RADEONPTR(pScrn);
71    drmBufPtr          buffer = ib;
72    int                start  = 0;
73    drm_radeon_indirect_t  indirect;
74
75#if defined(XF86DRM_MODE)
76    if (info->cs) {
77	radeon_cs_flush_indirect(pScrn);
78	return;
79    }
80#endif
81
82    if (!buffer) return;
83
84    //xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Flushing buffer %d\n",
85    //       buffer->idx);
86
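    /* Pad the IB with type-2 (NOP) packets until ->used (in bytes) is a
     * multiple of 64, i.e. 16 dwords: e.g. a buffer at 0x48 bytes gets
     * 14 filler dwords to reach 0x80 before submission.
     */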
87    while (buffer->used & 0x3c){
88	BEGIN_BATCH(1);
89        E32(buffer, CP_PACKET2()); /* fill up to multiple of 16 dwords */
90	END_BATCH();
91    }
92
93    info->accel_state->vbo.vb_offset = 0;
94    info->accel_state->vbo.vb_start_op = -1;
95
96    //ErrorF("buffer bytes: %d\n", buffer->used);
97
98    indirect.idx     = buffer->idx;
99    indirect.start   = start;
100    indirect.end     = buffer->used;
101    indirect.discard = 1;
102
103    drmCommandWriteRead(info->dri->drmFD, DRM_RADEON_INDIRECT,
104			&indirect, sizeof(drm_radeon_indirect_t));
105
106}
107
108void R600IBDiscard(ScrnInfoPtr pScrn, drmBufPtr ib)
109{
110#if defined(XF86DRM_MODE)
111    RADEONInfoPtr info = RADEONPTR(pScrn);
112    if (info->cs) {
113        radeon_ib_discard(pScrn);
114    }
115#endif
116    if (!ib) return;
117
118    ib->used = 0;
119    R600CPFlushIndirect(pScrn, ib);
120}
121
122void
123r600_wait_3d_idle_clean(ScrnInfoPtr pScrn, drmBufPtr ib)
124{
125    RADEONInfoPtr info = RADEONPTR(pScrn);
126
127    //flush caches, don't generate timestamp
128    BEGIN_BATCH(5);
129    PACK3(ib, IT_EVENT_WRITE, 1);
130    E32(ib, CACHE_FLUSH_AND_INV_EVENT);
131    // wait for 3D idle clean
132    EREG(ib, WAIT_UNTIL,                          (WAIT_3D_IDLE_bit |
133						   WAIT_3D_IDLECLEAN_bit));
134    END_BATCH();
135}
136
137void
138r600_wait_3d_idle(ScrnInfoPtr pScrn, drmBufPtr ib)
139{
140    RADEONInfoPtr info = RADEONPTR(pScrn);
141
142    BEGIN_BATCH(3);
143    EREG(ib, WAIT_UNTIL,                          WAIT_3D_IDLE_bit);
144    END_BATCH();
145}
146
147void
148r600_start_3d(ScrnInfoPtr pScrn, drmBufPtr ib)
149{
150    RADEONInfoPtr info = RADEONPTR(pScrn);
151
152    if (info->ChipFamily < CHIP_FAMILY_RV770) {
153	BEGIN_BATCH(5);
154	PACK3(ib, IT_START_3D_CMDBUF, 1);
155	E32(ib, 0);
156    } else
157	BEGIN_BATCH(3);
158
159    PACK3(ib, IT_CONTEXT_CONTROL, 2);
160    E32(ib, 0x80000000);
161    E32(ib, 0x80000000);
162    END_BATCH();
163
164}
165
166/*
167 * Setup of functional groups
168 */
169
170// asic stack/thread/gpr limits - need to query the drm
171static void
172r600_sq_setup(ScrnInfoPtr pScrn, drmBufPtr ib, sq_config_t *sq_conf)
173{
174    uint32_t sq_config, sq_gpr_resource_mgmt_1, sq_gpr_resource_mgmt_2;
175    uint32_t sq_thread_resource_mgmt, sq_stack_resource_mgmt_1, sq_stack_resource_mgmt_2;
176    RADEONInfoPtr info = RADEONPTR(pScrn);
177
178    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
179	(info->ChipFamily == CHIP_FAMILY_RV620) ||
180	(info->ChipFamily == CHIP_FAMILY_RS780) ||
181	(info->ChipFamily == CHIP_FAMILY_RS880) ||
182	(info->ChipFamily == CHIP_FAMILY_RV710))
183	sq_config = 0;						// no VC
184    else
185	sq_config = VC_ENABLE_bit;
186
187    sq_config |= (DX9_CONSTS_bit |
188		  ALU_INST_PREFER_VECTOR_bit |
189		  (sq_conf->ps_prio << PS_PRIO_shift) |
190		  (sq_conf->vs_prio << VS_PRIO_shift) |
191		  (sq_conf->gs_prio << GS_PRIO_shift) |
192		  (sq_conf->es_prio << ES_PRIO_shift));
193
194    sq_gpr_resource_mgmt_1 = ((sq_conf->num_ps_gprs << NUM_PS_GPRS_shift) |
195			      (sq_conf->num_vs_gprs << NUM_VS_GPRS_shift) |
196			      (sq_conf->num_temp_gprs << NUM_CLAUSE_TEMP_GPRS_shift));
197    sq_gpr_resource_mgmt_2 = ((sq_conf->num_gs_gprs << NUM_GS_GPRS_shift) |
198			      (sq_conf->num_es_gprs << NUM_ES_GPRS_shift));
199
200    sq_thread_resource_mgmt = ((sq_conf->num_ps_threads << NUM_PS_THREADS_shift) |
201			       (sq_conf->num_vs_threads << NUM_VS_THREADS_shift) |
202			       (sq_conf->num_gs_threads << NUM_GS_THREADS_shift) |
203			       (sq_conf->num_es_threads << NUM_ES_THREADS_shift));
204
205    sq_stack_resource_mgmt_1 = ((sq_conf->num_ps_stack_entries << NUM_PS_STACK_ENTRIES_shift) |
206				(sq_conf->num_vs_stack_entries << NUM_VS_STACK_ENTRIES_shift));
207
208    sq_stack_resource_mgmt_2 = ((sq_conf->num_gs_stack_entries << NUM_GS_STACK_ENTRIES_shift) |
209				(sq_conf->num_es_stack_entries << NUM_ES_STACK_ENTRIES_shift));
210
211    BEGIN_BATCH(8);
212    PACK0(ib, SQ_CONFIG, 6);
213    E32(ib, sq_config);
214    E32(ib, sq_gpr_resource_mgmt_1);
215    E32(ib, sq_gpr_resource_mgmt_2);
216    E32(ib, sq_thread_resource_mgmt);
217    E32(ib, sq_stack_resource_mgmt_1);
218    E32(ib, sq_stack_resource_mgmt_2);
219    END_BATCH();
220}
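/* The GPR, thread and stack counts above carve the shared SQ resources up
 * between the shader types; r600_set_default_state() below fills sq_config_t
 * with conservative per-ASIC values (see the switch on info->ChipFamily),
 * since the driver does not yet query the real limits from the DRM.
 */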
221
222void
223r600_set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf, uint32_t domain)
224{
225    uint32_t cb_color_info, cb_color_control;
226    unsigned pitch, slice, h, array_mode;
227    RADEONInfoPtr info = RADEONPTR(pScrn);
228
229
230#if defined(XF86DRM_MODE)
231    if (info->cs && cb_conf->surface) {
232	switch (cb_conf->surface->level[0].mode) {
233	case RADEON_SURF_MODE_1D:
234		array_mode = 2;
235		break;
236	case RADEON_SURF_MODE_2D:
237		array_mode = 4;
238		break;
239	default:
240		array_mode = 0;
241		break;
242	}
243	pitch = (cb_conf->surface->level[0].nblk_x >> 3) - 1;
244	slice = ((cb_conf->surface->level[0].nblk_x * cb_conf->surface->level[0].nblk_y) / 64) - 1;
245    } else
246#endif
247    {
248	array_mode = cb_conf->array_mode;
249	pitch = (cb_conf->w / 8) - 1;
250	h = RADEON_ALIGN(cb_conf->h, 8);
251	slice = ((cb_conf->w * h) / 64) - 1;
252    }
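    /* PITCH_TILE_MAX/SLICE_TILE_MAX are in units of 8 and 64 pixels, minus
     * one: e.g. a linear 1024x768 target gives pitch = 1024/8 - 1 = 127 and
     * slice = (1024 * 768)/64 - 1 = 12287.
     */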
253
254    cb_color_info = ((cb_conf->endian      << ENDIAN_shift)				|
255		     (cb_conf->format      << CB_COLOR0_INFO__FORMAT_shift)		|
256		     (array_mode  << CB_COLOR0_INFO__ARRAY_MODE_shift)		|
257		     (cb_conf->number_type << NUMBER_TYPE_shift)			|
258		     (cb_conf->comp_swap   << COMP_SWAP_shift)				|
259		     (cb_conf->tile_mode   << CB_COLOR0_INFO__TILE_MODE_shift));
260    if (cb_conf->read_size)
261	cb_color_info |= CB_COLOR0_INFO__READ_SIZE_bit;
262    if (cb_conf->blend_clamp)
263	cb_color_info |= BLEND_CLAMP_bit;
264    if (cb_conf->clear_color)
265	cb_color_info |= CLEAR_COLOR_bit;
266    if (cb_conf->blend_bypass)
267	cb_color_info |= BLEND_BYPASS_bit;
268    if (cb_conf->blend_float32)
269	cb_color_info |= BLEND_FLOAT32_bit;
270    if (cb_conf->simple_float)
271	cb_color_info |= SIMPLE_FLOAT_bit;
272    if (cb_conf->round_mode)
273	cb_color_info |= CB_COLOR0_INFO__ROUND_MODE_bit;
274    if (cb_conf->tile_compact)
275	cb_color_info |= TILE_COMPACT_bit;
276    if (cb_conf->source_format)
277	cb_color_info |= SOURCE_FORMAT_bit;
278
279    BEGIN_BATCH(3 + 2);
280    EREG(ib, (CB_COLOR0_BASE + (4 * cb_conf->id)), (cb_conf->base >> 8));
281    RELOC_BATCH(cb_conf->bo, 0, domain);
282    END_BATCH();
283
284    // rv6xx workaround
285    if ((info->ChipFamily > CHIP_FAMILY_R600) &&
286        (info->ChipFamily < CHIP_FAMILY_RV770)) {
287        BEGIN_BATCH(2);
288        PACK3(ib, IT_SURFACE_BASE_UPDATE, 1);
289        E32(ib, (2 << cb_conf->id));
290        END_BATCH();
291    }
292    /* Set the CMASK & FMASK buffers to the offset of the color buffer;
293     * as we don't use them, this shouldn't cause any issue and we
294     * then have a valid cmd stream
295     */
296    BEGIN_BATCH(3 + 2);
297    EREG(ib, (CB_COLOR0_TILE + (4 * cb_conf->id)), (0     >> 8));	// CMASK per-tile data base/256
298    RELOC_BATCH(cb_conf->bo, 0, domain);
299    END_BATCH();
300    BEGIN_BATCH(3 + 2);
301    EREG(ib, (CB_COLOR0_FRAG + (4 * cb_conf->id)), (0     >> 8));	// FMASK per-tile data base/256
302    RELOC_BATCH(cb_conf->bo, 0, domain);
303    END_BATCH();
304    BEGIN_BATCH(9);
305    // pitch only for ARRAY_LINEAR_GENERAL, other tiling modes require addrlib
306    EREG(ib, (CB_COLOR0_SIZE + (4 * cb_conf->id)), ((pitch << PITCH_TILE_MAX_shift)	|
307						    (slice << SLICE_TILE_MAX_shift)));
308    EREG(ib, (CB_COLOR0_VIEW + (4 * cb_conf->id)), ((0    << SLICE_START_shift)		|
309						    (0    << SLICE_MAX_shift)));
310    EREG(ib, (CB_COLOR0_MASK + (4 * cb_conf->id)), ((0    << CMASK_BLOCK_MAX_shift)	|
311						    (0    << FMASK_TILE_MAX_shift)));
312    END_BATCH();
313
314    BEGIN_BATCH(3 + 2);
315    EREG(ib, (CB_COLOR0_INFO + (4 * cb_conf->id)), cb_color_info);
316    RELOC_BATCH(cb_conf->bo, 0, domain);
317    END_BATCH();
318
319    BEGIN_BATCH(9);
320    EREG(ib, CB_TARGET_MASK,          (cb_conf->pmask << TARGET0_ENABLE_shift));
321    cb_color_control = R600_ROP[cb_conf->rop] |
322	(cb_conf->blend_enable << TARGET_BLEND_ENABLE_shift);
323    if (info->ChipFamily == CHIP_FAMILY_R600) {
324	/* no per-MRT blend on R600 */
325	EREG(ib, CB_COLOR_CONTROL,    cb_color_control);
326	EREG(ib, CB_BLEND_CONTROL,    cb_conf->blendcntl);
327    } else {
328	if (cb_conf->blend_enable)
329	    cb_color_control |= PER_MRT_BLEND_bit;
330	EREG(ib, CB_COLOR_CONTROL,    cb_color_control);
331	EREG(ib, CB_BLEND0_CONTROL,   cb_conf->blendcntl);
332    }
333    END_BATCH();
334}
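/* A minimal sketch (not compiled) of how a caller might fill cb_config_t for
 * a 32bpp linear destination; dst_pitch_px, dst_height, dst_offset and dst_bo
 * are placeholders, and the COLOR_8_8_8_8 format and comp_swap value are
 * assumptions modelled on the ARGB8888 setup in r600_exa.c.
 */
#if 0
    cb_config_t cb_conf;

    memset(&cb_conf, 0, sizeof(cb_conf));
    cb_conf.id = 0;                     /* MRT 0 */
    cb_conf.w = dst_pitch_px;           /* pitch in pixels, multiple of 8 */
    cb_conf.h = dst_height;
    cb_conf.base = dst_offset;
    cb_conf.bo = dst_bo;
    cb_conf.format = COLOR_8_8_8_8;     /* assumed ARGB8888 format define */
    cb_conf.comp_swap = 1;              /* assumed channel swap for ARGB */
    cb_conf.rop = 3;                    /* GXcopy */
    cb_conf.pmask = 0xf;                /* enable all four channels */
    r600_set_render_target(pScrn, ib, &cb_conf, RADEON_GEM_DOMAIN_VRAM);
#endif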
335
336static void
337r600_cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type,
338			 uint32_t size, uint64_t mc_addr,
339			 struct radeon_bo *bo, uint32_t rdomains, uint32_t wdomain)
340{
341    RADEONInfoPtr info = RADEONPTR(pScrn);
342    uint32_t cp_coher_size;
343    if (size == 0xffffffff)
344	cp_coher_size = 0xffffffff;
345    else
346	cp_coher_size = ((size + 255) >> 8);
347
348    BEGIN_BATCH(5 + 2);
349    PACK3(ib, IT_SURFACE_SYNC, 4);
350    E32(ib, sync_type);
351    E32(ib, cp_coher_size);
352    E32(ib, (mc_addr >> 8));
353    E32(ib, 10); /* poll interval */
354    RELOC_BATCH(bo, rdomains, wdomain);
355    END_BATCH();
356}
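/* CP_COHER_SIZE is in 256-byte units, rounded up, so a 16384-byte shader or
 * vertex buffer becomes (16384 + 255) >> 8 = 64; the special value 0xffffffff
 * is passed through unchanged.
 */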
357
358/* inserts a wait for vline in the command stream */
359void
360r600_cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix,
361			xf86CrtcPtr crtc, int start, int stop)
362{
363    RADEONInfoPtr  info = RADEONPTR(pScrn);
364    uint32_t offset;
365
366    if (!crtc)
367        return;
368
369    if (!crtc->enabled)
370        return;
371
372    if (info->cs) {
373        if (pPix != pScrn->pScreen->GetScreenPixmap(pScrn->pScreen))
374	    return;
375    } else {
376#ifdef USE_EXA
377	if (info->useEXA)
378	    offset = exaGetPixmapOffset(pPix);
379	else
380#endif
381	    offset = pPix->devPrivate.ptr - info->FB;
382
383	/* only wait for vline when drawing to the front buffer */
384	if (offset != 0)
385	    return;
386    }
387
388    start = max(start, crtc->y);
389    stop = min(stop, crtc->y + crtc->mode.VDisplay);
390
391    if (start >= stop)
392        return;
393
394#if defined(XF86DRM_MODE)
395    if (info->cs) {
396	drmmode_crtc_private_ptr drmmode_crtc = crtc->driver_private;
397
398	BEGIN_BATCH(11);
399	/* set the VLINE range */
400	EREG(ib, AVIVO_D1MODE_VLINE_START_END, /* this is just a marker */
401	     (start << AVIVO_D1MODE_VLINE_START_SHIFT) |
402	     (stop << AVIVO_D1MODE_VLINE_END_SHIFT));
403
404	/* tell the CP to poll the VLINE state register */
405	PACK3(ib, IT_WAIT_REG_MEM, 6);
406	E32(ib, IT_WAIT_REG | IT_WAIT_EQ);
407	E32(ib, IT_WAIT_ADDR(AVIVO_D1MODE_VLINE_STATUS));
408	E32(ib, 0);
409	E32(ib, 0);                          // Ref value
410	E32(ib, AVIVO_D1MODE_VLINE_STAT);    // Mask
411	E32(ib, 10);                         // Wait interval
412	/* add crtc reloc */
413	PACK3(ib, IT_NOP, 1);
414	E32(ib, drmmode_crtc->mode_crtc->crtc_id);
415	END_BATCH();
416    } else
417#endif
418    {
419	RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private;
420
421	BEGIN_BATCH(9);
422	/* set the VLINE range */
423	EREG(ib, AVIVO_D1MODE_VLINE_START_END + radeon_crtc->crtc_offset,
424	     (start << AVIVO_D1MODE_VLINE_START_SHIFT) |
425	     (stop << AVIVO_D1MODE_VLINE_END_SHIFT));
426
427	/* tell the CP to poll the VLINE state register */
428	PACK3(ib, IT_WAIT_REG_MEM, 6);
429	E32(ib, IT_WAIT_REG | IT_WAIT_EQ);
430	E32(ib, IT_WAIT_ADDR(AVIVO_D1MODE_VLINE_STATUS + radeon_crtc->crtc_offset));
431	E32(ib, 0);
432	E32(ib, 0);                          // Ref value
433	E32(ib, AVIVO_D1MODE_VLINE_STAT);    // Mask
434	E32(ib, 10);                         // Wait interval
435	END_BATCH();
436    }
437}
438
439void
440r600_set_spi(ScrnInfoPtr pScrn, drmBufPtr ib, int vs_export_count, int num_interp)
441{
442    RADEONInfoPtr info = RADEONPTR(pScrn);
443
444    BEGIN_BATCH(8);
445    /* Interpolator setup */
446    EREG(ib, SPI_VS_OUT_CONFIG, (vs_export_count << VS_EXPORT_COUNT_shift));
447    PACK0(ib, SPI_PS_IN_CONTROL_0, 3);
448    E32(ib, (num_interp << NUM_INTERP_shift));
449    E32(ib, 0);
450    E32(ib, 0);
451    END_BATCH();
452}
453
454void
455r600_fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf, uint32_t domain)
456{
457    RADEONInfoPtr info = RADEONPTR(pScrn);
458    uint32_t sq_pgm_resources;
459
460    sq_pgm_resources = ((fs_conf->num_gprs << NUM_GPRS_shift) |
461			(fs_conf->stack_size << STACK_SIZE_shift));
462
463    if (fs_conf->dx10_clamp)
464	sq_pgm_resources |= SQ_PGM_RESOURCES_FS__DX10_CLAMP_bit;
465
466    BEGIN_BATCH(3 + 2);
467    EREG(ib, SQ_PGM_START_FS, fs_conf->shader_addr >> 8);
468    RELOC_BATCH(fs_conf->bo, domain, 0);
469    END_BATCH();
470
471    BEGIN_BATCH(6);
472    EREG(ib, SQ_PGM_RESOURCES_FS, sq_pgm_resources);
473    EREG(ib, SQ_PGM_CF_OFFSET_FS, 0);
474    END_BATCH();
475}
476
477void
478r600_vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf, uint32_t domain)
479{
480    RADEONInfoPtr info = RADEONPTR(pScrn);
481    uint32_t sq_pgm_resources;
482
483    sq_pgm_resources = ((vs_conf->num_gprs << NUM_GPRS_shift) |
484			(vs_conf->stack_size << STACK_SIZE_shift));
485
486    if (vs_conf->dx10_clamp)
487	sq_pgm_resources |= SQ_PGM_RESOURCES_VS__DX10_CLAMP_bit;
488    if (vs_conf->fetch_cache_lines)
489	sq_pgm_resources |= (vs_conf->fetch_cache_lines << FETCH_CACHE_LINES_shift);
490    if (vs_conf->uncached_first_inst)
491	sq_pgm_resources |= UNCACHED_FIRST_INST_bit;
492
493    /* flush SQ cache */
494    r600_cp_set_surface_sync(pScrn, ib, SH_ACTION_ENA_bit,
495			     vs_conf->shader_size, vs_conf->shader_addr,
496			     vs_conf->bo, domain, 0);
497
498    BEGIN_BATCH(3 + 2);
499    EREG(ib, SQ_PGM_START_VS, vs_conf->shader_addr >> 8);
500    RELOC_BATCH(vs_conf->bo, domain, 0);
501    END_BATCH();
502
503    BEGIN_BATCH(6);
504    EREG(ib, SQ_PGM_RESOURCES_VS, sq_pgm_resources);
505    EREG(ib, SQ_PGM_CF_OFFSET_VS, 0);
506    END_BATCH();
507}
508
509void
510r600_ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf, uint32_t domain)
511{
512    RADEONInfoPtr info = RADEONPTR(pScrn);
513    uint32_t sq_pgm_resources;
514
515    sq_pgm_resources = ((ps_conf->num_gprs << NUM_GPRS_shift) |
516			(ps_conf->stack_size << STACK_SIZE_shift));
517
518    if (ps_conf->dx10_clamp)
519	sq_pgm_resources |= SQ_PGM_RESOURCES_PS__DX10_CLAMP_bit;
520    if (ps_conf->fetch_cache_lines)
521	sq_pgm_resources |= (ps_conf->fetch_cache_lines << FETCH_CACHE_LINES_shift);
522    if (ps_conf->uncached_first_inst)
523	sq_pgm_resources |= UNCACHED_FIRST_INST_bit;
524    if (ps_conf->clamp_consts)
525	sq_pgm_resources |= CLAMP_CONSTS_bit;
526
527    /* flush SQ cache */
528    r600_cp_set_surface_sync(pScrn, ib, SH_ACTION_ENA_bit,
529			     ps_conf->shader_size, ps_conf->shader_addr,
530			     ps_conf->bo, domain, 0);
531
532    BEGIN_BATCH(3 + 2);
533    EREG(ib, SQ_PGM_START_PS, ps_conf->shader_addr >> 8);
534    RELOC_BATCH(ps_conf->bo, domain, 0);
535    END_BATCH();
536
537    BEGIN_BATCH(9);
538    EREG(ib, SQ_PGM_RESOURCES_PS, sq_pgm_resources);
539    EREG(ib, SQ_PGM_EXPORTS_PS, ps_conf->export_mode);
540    EREG(ib, SQ_PGM_CF_OFFSET_PS, 0);
541    END_BATCH();
542}
543
544void
545r600_set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *const_buf)
546{
547    RADEONInfoPtr info = RADEONPTR(pScrn);
548    int i;
549    const int countreg = count * (SQ_ALU_CONSTANT_offset >> 2);
550
551    BEGIN_BATCH(2 + countreg);
552    PACK0(ib, SQ_ALU_CONSTANT + offset * SQ_ALU_CONSTANT_offset, countreg);
553    for (i = 0; i < countreg; i++)
554	EFLOAT(ib, const_buf[i]);
555    END_BATCH();
556}
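/* Each ALU constant is a float4, so countreg expands count into dwords via
 * SQ_ALU_CONSTANT_offset >> 2 (4 dwords per constant): e.g. count = 2 emits
 * 8 EFLOAT values starting at the register block selected by 'offset'.
 */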
557
558void
559r600_set_bool_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, uint32_t val)
560{
561    RADEONInfoPtr info = RADEONPTR(pScrn);
562    /* bool register order is: ps, vs, gs; one register each
563     * 1 bit per bool; 32 bools each for ps, vs, gs.
564     */
565    BEGIN_BATCH(3);
566    EREG(ib, SQ_BOOL_CONST + offset * SQ_BOOL_CONST_offset, val);
567    END_BATCH();
568}
569
570static void
571r600_set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res, uint32_t domain)
572{
573    RADEONInfoPtr info = RADEONPTR(pScrn);
574    struct radeon_accel_state *accel_state = info->accel_state;
575    uint32_t sq_vtx_constant_word2;
576
577    sq_vtx_constant_word2 = ((((res->vb_addr) >> 32) & BASE_ADDRESS_HI_mask) |
578			     ((res->vtx_size_dw << 2) << SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift) |
579			     (res->format << SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift) |
580			     (res->num_format_all << SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift) |
581			     (res->endian << SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_shift));
582    if (res->clamp_x)
583	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__CLAMP_X_bit;
584
585    if (res->format_comp_all)
586	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit;
587
588    if (res->srf_mode_all)
589	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__SRF_MODE_ALL_bit;
590
591    /* flush vertex cache */
592    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
593	(info->ChipFamily == CHIP_FAMILY_RV620) ||
594	(info->ChipFamily == CHIP_FAMILY_RS780) ||
595	(info->ChipFamily == CHIP_FAMILY_RS880) ||
596	(info->ChipFamily == CHIP_FAMILY_RV710))
597	r600_cp_set_surface_sync(pScrn, ib, TC_ACTION_ENA_bit,
598				 accel_state->vbo.vb_offset, accel_state->vbo.vb_mc_addr,
599				 res->bo,
600				 domain, 0);
601    else
602	r600_cp_set_surface_sync(pScrn, ib, VC_ACTION_ENA_bit,
603				 accel_state->vbo.vb_offset, accel_state->vbo.vb_mc_addr,
604				 res->bo,
605				 domain, 0);
606
607    BEGIN_BATCH(9 + 2);
608    PACK0(ib, SQ_VTX_RESOURCE + res->id * SQ_VTX_RESOURCE_offset, 7);
609    E32(ib, res->vb_addr & 0xffffffff);				// 0: BASE_ADDRESS
610    E32(ib, (res->vtx_num_entries << 2) - 1);			// 1: SIZE
611    E32(ib, sq_vtx_constant_word2);	// 2: BASE_HI, STRIDE, CLAMP, FORMAT, ENDIAN
612    E32(ib, res->mem_req_size << MEM_REQUEST_SIZE_shift);		// 3: MEM_REQUEST_SIZE ?!?
613    E32(ib, 0);							// 4: n/a
614    E32(ib, 0);							// 5: n/a
615    E32(ib, SQ_TEX_VTX_VALID_BUFFER << SQ_VTX_CONSTANT_WORD6_0__TYPE_shift);	// 6: TYPE
616    RELOC_BATCH(res->bo, domain, 0);
617    END_BATCH();
618}
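/* Note the unit conventions above: vtx_size_dw is the per-vertex stride in
 * dwords (hence the << 2 into the STRIDE field, which takes bytes), and
 * vtx_num_entries is the buffer size in dwords, so the SIZE entry is the
 * byte size minus one.
 */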
619
620void
621r600_set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res, uint32_t domain)
622{
623    RADEONInfoPtr info = RADEONPTR(pScrn);
624    uint32_t sq_tex_resource_word0, sq_tex_resource_word1, sq_tex_resource_word4;
625    uint32_t sq_tex_resource_word5, sq_tex_resource_word6;
626    uint32_t array_mode, pitch;
627
628#if defined(XF86DRM_MODE)
629    if (info->cs && tex_res->surface) {
630	switch (tex_res->surface->level[0].mode) {
631	case RADEON_SURF_MODE_1D:
632		array_mode = 2;
633		break;
634	case RADEON_SURF_MODE_2D:
635		array_mode = 4;
636		break;
637	default:
638		array_mode = 0;
639		break;
640	}
641	pitch = tex_res->surface->level[0].nblk_x >> 3;
642    } else
643#endif
644    {
645	array_mode = tex_res->tile_mode;
646	pitch = (tex_res->pitch + 7) >> 3;
647    }
648
649    sq_tex_resource_word0 = ((tex_res->dim << DIM_shift) |
650		     (array_mode << SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift));
651
652    if (tex_res->w)
653	sq_tex_resource_word0 |= (((pitch - 1) << PITCH_shift) |
654				  ((tex_res->w - 1) << TEX_WIDTH_shift));
655
656    if (tex_res->tile_type)
657	sq_tex_resource_word0 |= TILE_TYPE_bit;
658
659    sq_tex_resource_word1 = (tex_res->format << SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift);
660
661    if (tex_res->h)
662	sq_tex_resource_word1 |= ((tex_res->h - 1) << TEX_HEIGHT_shift);
663    if (tex_res->depth)
664	sq_tex_resource_word1 |= ((tex_res->depth - 1) << TEX_DEPTH_shift);
665
666    sq_tex_resource_word4 = ((tex_res->format_comp_x << FORMAT_COMP_X_shift) |
667			     (tex_res->format_comp_y << FORMAT_COMP_Y_shift) |
668			     (tex_res->format_comp_z << FORMAT_COMP_Z_shift) |
669			     (tex_res->format_comp_w << FORMAT_COMP_W_shift) |
670			     (tex_res->num_format_all << SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_shift) |
671			     (tex_res->endian << SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_shift) |
672			     (tex_res->request_size << REQUEST_SIZE_shift) |
673			     (tex_res->dst_sel_x << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift) |
674			     (tex_res->dst_sel_y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift) |
675			     (tex_res->dst_sel_z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift) |
676			     (tex_res->dst_sel_w << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift) |
677			     (tex_res->base_level << BASE_LEVEL_shift));
678
679    if (tex_res->srf_mode_all)
680	sq_tex_resource_word4 |= SQ_TEX_RESOURCE_WORD4_0__SRF_MODE_ALL_bit;
681    if (tex_res->force_degamma)
682	sq_tex_resource_word4 |= SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit;
683
684    sq_tex_resource_word5 = ((tex_res->last_level << LAST_LEVEL_shift) |
685			     (tex_res->base_array << BASE_ARRAY_shift) |
686			     (tex_res->last_array << LAST_ARRAY_shift));
687
688    sq_tex_resource_word6 = ((tex_res->mpeg_clamp << MPEG_CLAMP_shift) |
689			     (tex_res->perf_modulation << PERF_MODULATION_shift) |
690			     (SQ_TEX_VTX_VALID_TEXTURE << SQ_TEX_RESOURCE_WORD6_0__TYPE_shift));
691
692    if (tex_res->interlaced)
693	sq_tex_resource_word6 |= INTERLACED_bit;
694
695    /* flush texture cache */
696    r600_cp_set_surface_sync(pScrn, ib, TC_ACTION_ENA_bit,
697			     tex_res->size, tex_res->base,
698			     tex_res->bo, domain, 0);
699
700    BEGIN_BATCH(9 + 4);
701    PACK0(ib, SQ_TEX_RESOURCE + tex_res->id * SQ_TEX_RESOURCE_offset, 7);
702    E32(ib, sq_tex_resource_word0);
703    E32(ib, sq_tex_resource_word1);
704    E32(ib, ((tex_res->base) >> 8));
705    E32(ib, ((tex_res->mip_base) >> 8));
706    E32(ib, sq_tex_resource_word4);
707    E32(ib, sq_tex_resource_word5);
708    E32(ib, sq_tex_resource_word6);
709    RELOC_BATCH(tex_res->bo, domain, 0);
710    RELOC_BATCH(tex_res->mip_bo, domain, 0);
711    END_BATCH();
712}
713
714void
715r600_set_tex_sampler (ScrnInfoPtr pScrn, drmBufPtr ib, tex_sampler_t *s)
716{
717    RADEONInfoPtr info = RADEONPTR(pScrn);
718    uint32_t sq_tex_sampler_word0, sq_tex_sampler_word1, sq_tex_sampler_word2;
719
720    sq_tex_sampler_word0 = ((s->clamp_x       << SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift)		|
721			    (s->clamp_y       << CLAMP_Y_shift)					|
722			    (s->clamp_z       << CLAMP_Z_shift)					|
723			    (s->xy_mag_filter << XY_MAG_FILTER_shift)				|
724			    (s->xy_min_filter << XY_MIN_FILTER_shift)				|
725			    (s->z_filter      << Z_FILTER_shift)	|
726			    (s->mip_filter    << MIP_FILTER_shift)				|
727			    (s->border_color  << BORDER_COLOR_TYPE_shift)			|
728			    (s->depth_compare << DEPTH_COMPARE_FUNCTION_shift)			|
729			    (s->chroma_key    << CHROMA_KEY_shift));
730    if (s->point_sampling_clamp)
731	sq_tex_sampler_word0 |= POINT_SAMPLING_CLAMP_bit;
732    if (s->tex_array_override)
733	sq_tex_sampler_word0 |= TEX_ARRAY_OVERRIDE_bit;
734    if (s->lod_uses_minor_axis)
735	sq_tex_sampler_word0 |= LOD_USES_MINOR_AXIS_bit;
736
737    sq_tex_sampler_word1 = ((s->min_lod       << MIN_LOD_shift)					|
738			    (s->max_lod       << MAX_LOD_shift)					|
739			    (s->lod_bias      << SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_shift));
740
741    sq_tex_sampler_word2 = ((s->lod_bias2     << LOD_BIAS_SEC_shift)	|
742			    (s->perf_mip      << PERF_MIP_shift)	|
743			    (s->perf_z        << PERF_Z_shift));
744    if (s->mc_coord_truncate)
745	sq_tex_sampler_word2 |= MC_COORD_TRUNCATE_bit;
746    if (s->force_degamma)
747	sq_tex_sampler_word2 |= SQ_TEX_SAMPLER_WORD2_0__FORCE_DEGAMMA_bit;
748    if (s->high_precision_filter)
749	sq_tex_sampler_word2 |= HIGH_PRECISION_FILTER_bit;
750    if (s->fetch_4)
751	sq_tex_sampler_word2 |= FETCH_4_bit;
752    if (s->sample_is_pcf)
753	sq_tex_sampler_word2 |= SAMPLE_IS_PCF_bit;
754    if (s->type)
755	sq_tex_sampler_word2 |= SQ_TEX_SAMPLER_WORD2_0__TYPE_bit;
756
757    BEGIN_BATCH(5);
758    PACK0(ib, SQ_TEX_SAMPLER_WORD + s->id * SQ_TEX_SAMPLER_WORD_offset, 3);
759    E32(ib, sq_tex_sampler_word0);
760    E32(ib, sq_tex_sampler_word1);
761    E32(ib, sq_tex_sampler_word2);
762    END_BATCH();
763}
764
765//XXX deal with clip offsets in clip setup
766void
767r600_set_screen_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
768{
769    RADEONInfoPtr info = RADEONPTR(pScrn);
770
771    BEGIN_BATCH(4);
772    PACK0(ib, PA_SC_SCREEN_SCISSOR_TL, 2);
773    E32(ib, ((x1 << PA_SC_SCREEN_SCISSOR_TL__TL_X_shift) |
774	     (y1 << PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift)));
775    E32(ib, ((x2 << PA_SC_SCREEN_SCISSOR_BR__BR_X_shift) |
776	     (y2 << PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift)));
777    END_BATCH();
778}
779
780void
781r600_set_vport_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
782{
783    RADEONInfoPtr info = RADEONPTR(pScrn);
784
785    BEGIN_BATCH(4);
786    PACK0(ib, PA_SC_VPORT_SCISSOR_0_TL + id * PA_SC_VPORT_SCISSOR_0_TL_offset, 2);
787    E32(ib, ((x1 << PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift) |
788	     (y1 << PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift) |
789	     WINDOW_OFFSET_DISABLE_bit));
790    E32(ib, ((x2 << PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift) |
791	     (y2 << PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift)));
792    END_BATCH();
793}
794
795void
796r600_set_generic_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
797{
798    RADEONInfoPtr info = RADEONPTR(pScrn);
799
800    BEGIN_BATCH(4);
801    PACK0(ib, PA_SC_GENERIC_SCISSOR_TL, 2);
802    E32(ib, ((x1 << PA_SC_GENERIC_SCISSOR_TL__TL_X_shift) |
803	     (y1 << PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift) |
804	     WINDOW_OFFSET_DISABLE_bit));
805    E32(ib, ((x2 << PA_SC_GENERIC_SCISSOR_BR__BR_X_shift) |
806	     (y2 << PA_SC_GENERIC_SCISSOR_BR__BR_Y_shift)));
807    END_BATCH();
808}
809
810void
811r600_set_window_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
812{
813    RADEONInfoPtr info = RADEONPTR(pScrn);
814
815    BEGIN_BATCH(4);
816    PACK0(ib, PA_SC_WINDOW_SCISSOR_TL, 2);
817    E32(ib, ((x1 << PA_SC_WINDOW_SCISSOR_TL__TL_X_shift) |
818	     (y1 << PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift) |
819	     WINDOW_OFFSET_DISABLE_bit));
820    E32(ib, ((x2 << PA_SC_WINDOW_SCISSOR_BR__BR_X_shift) |
821	      (y2 << PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift)));
822    END_BATCH();
823}
824
825void
826r600_set_clip_rect(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
827{
828    RADEONInfoPtr info = RADEONPTR(pScrn);
829
830    BEGIN_BATCH(4);
831    PACK0(ib, PA_SC_CLIPRECT_0_TL + id * PA_SC_CLIPRECT_0_TL_offset, 2);
832    E32(ib, ((x1 << PA_SC_CLIPRECT_0_TL__TL_X_shift) |
833	     (y1 << PA_SC_CLIPRECT_0_TL__TL_Y_shift)));
834    E32(ib, ((x2 << PA_SC_CLIPRECT_0_BR__BR_X_shift) |
835	     (y2 << PA_SC_CLIPRECT_0_BR__BR_Y_shift)));
836    END_BATCH();
837}
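/* The screen, generic, window and viewport scissors (and the clip rects
 * above) are effectively intersected by the hardware, so the default state
 * below opens the clip rects and per-viewport scissors to 8192x8192 and
 * per-operation code typically only programs the destination extents.
 */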
838
839/*
840 * Setup of default state
841 */
842
843void
844r600_set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
845{
846    tex_resource_t tex_res;
847    shader_config_t fs_conf;
848    sq_config_t sq_conf;
849    int i;
850    RADEONInfoPtr info = RADEONPTR(pScrn);
851    struct radeon_accel_state *accel_state = info->accel_state;
852
853    if (accel_state->XInited3D)
854	return;
855
856    memset(&tex_res, 0, sizeof(tex_resource_t));
857    memset(&fs_conf, 0, sizeof(shader_config_t));
858
859    accel_state->XInited3D = TRUE;
860
861    r600_start_3d(pScrn, accel_state->ib);
862
863    // SQ
864    sq_conf.ps_prio = 0;
865    sq_conf.vs_prio = 1;
866    sq_conf.gs_prio = 2;
867    sq_conf.es_prio = 3;
868    // need to set stack/thread/gpr limits based on the asic
869    // for now just set them low enough so any card will work
870    // see r600_cp.c in the drm
871    switch (info->ChipFamily) {
872    case CHIP_FAMILY_R600:
873	sq_conf.num_ps_gprs = 192;
874	sq_conf.num_vs_gprs = 56;
875	sq_conf.num_temp_gprs = 4;
876	sq_conf.num_gs_gprs = 0;
877	sq_conf.num_es_gprs = 0;
878	sq_conf.num_ps_threads = 136;
879	sq_conf.num_vs_threads = 48;
880	sq_conf.num_gs_threads = 4;
881	sq_conf.num_es_threads = 4;
882	sq_conf.num_ps_stack_entries = 128;
883	sq_conf.num_vs_stack_entries = 128;
884	sq_conf.num_gs_stack_entries = 0;
885	sq_conf.num_es_stack_entries = 0;
886	break;
887    case CHIP_FAMILY_RV630:
888    case CHIP_FAMILY_RV635:
889	sq_conf.num_ps_gprs = 84;
890	sq_conf.num_vs_gprs = 36;
891	sq_conf.num_temp_gprs = 4;
892	sq_conf.num_gs_gprs = 0;
893	sq_conf.num_es_gprs = 0;
894	sq_conf.num_ps_threads = 144;
895	sq_conf.num_vs_threads = 40;
896	sq_conf.num_gs_threads = 4;
897	sq_conf.num_es_threads = 4;
898	sq_conf.num_ps_stack_entries = 40;
899	sq_conf.num_vs_stack_entries = 40;
900	sq_conf.num_gs_stack_entries = 32;
901	sq_conf.num_es_stack_entries = 16;
902	break;
903    case CHIP_FAMILY_RV610:
904    case CHIP_FAMILY_RV620:
905    case CHIP_FAMILY_RS780:
906    case CHIP_FAMILY_RS880:
907    default:
908	sq_conf.num_ps_gprs = 84;
909	sq_conf.num_vs_gprs = 36;
910	sq_conf.num_temp_gprs = 4;
911	sq_conf.num_gs_gprs = 0;
912	sq_conf.num_es_gprs = 0;
913	sq_conf.num_ps_threads = 136;
914	sq_conf.num_vs_threads = 48;
915	sq_conf.num_gs_threads = 4;
916	sq_conf.num_es_threads = 4;
917	sq_conf.num_ps_stack_entries = 40;
918	sq_conf.num_vs_stack_entries = 40;
919	sq_conf.num_gs_stack_entries = 32;
920	sq_conf.num_es_stack_entries = 16;
921	break;
922    case CHIP_FAMILY_RV670:
923	sq_conf.num_ps_gprs = 144;
924	sq_conf.num_vs_gprs = 40;
925	sq_conf.num_temp_gprs = 4;
926	sq_conf.num_gs_gprs = 0;
927	sq_conf.num_es_gprs = 0;
928	sq_conf.num_ps_threads = 136;
929	sq_conf.num_vs_threads = 48;
930	sq_conf.num_gs_threads = 4;
931	sq_conf.num_es_threads = 4;
932	sq_conf.num_ps_stack_entries = 40;
933	sq_conf.num_vs_stack_entries = 40;
934	sq_conf.num_gs_stack_entries = 32;
935	sq_conf.num_es_stack_entries = 16;
936	break;
937    case CHIP_FAMILY_RV770:
938	sq_conf.num_ps_gprs = 192;
939	sq_conf.num_vs_gprs = 56;
940	sq_conf.num_temp_gprs = 4;
941	sq_conf.num_gs_gprs = 0;
942	sq_conf.num_es_gprs = 0;
943	sq_conf.num_ps_threads = 188;
944	sq_conf.num_vs_threads = 60;
945	sq_conf.num_gs_threads = 0;
946	sq_conf.num_es_threads = 0;
947	sq_conf.num_ps_stack_entries = 256;
948	sq_conf.num_vs_stack_entries = 256;
949	sq_conf.num_gs_stack_entries = 0;
950	sq_conf.num_es_stack_entries = 0;
951	break;
952    case CHIP_FAMILY_RV730:
953    case CHIP_FAMILY_RV740:
954	sq_conf.num_ps_gprs = 84;
955	sq_conf.num_vs_gprs = 36;
956	sq_conf.num_temp_gprs = 4;
957	sq_conf.num_gs_gprs = 0;
958	sq_conf.num_es_gprs = 0;
959	sq_conf.num_ps_threads = 188;
960	sq_conf.num_vs_threads = 60;
961	sq_conf.num_gs_threads = 0;
962	sq_conf.num_es_threads = 0;
963	sq_conf.num_ps_stack_entries = 128;
964	sq_conf.num_vs_stack_entries = 128;
965	sq_conf.num_gs_stack_entries = 0;
966	sq_conf.num_es_stack_entries = 0;
967	break;
968    case CHIP_FAMILY_RV710:
969	sq_conf.num_ps_gprs = 192;
970	sq_conf.num_vs_gprs = 56;
971	sq_conf.num_temp_gprs = 4;
972	sq_conf.num_gs_gprs = 0;
973	sq_conf.num_es_gprs = 0;
974	sq_conf.num_ps_threads = 144;
975	sq_conf.num_vs_threads = 48;
976	sq_conf.num_gs_threads = 0;
977	sq_conf.num_es_threads = 0;
978	sq_conf.num_ps_stack_entries = 128;
979	sq_conf.num_vs_stack_entries = 128;
980	sq_conf.num_gs_stack_entries = 0;
981	sq_conf.num_es_stack_entries = 0;
982	break;
983    }
984
985    r600_sq_setup(pScrn, ib, &sq_conf);
986
987    /* set fake reloc for unused depth */
988    BEGIN_BATCH(3 + 2);
989    EREG(ib, DB_DEPTH_INFO, 0);
990    RELOC_BATCH(accel_state->shaders_bo, RADEON_GEM_DOMAIN_VRAM, 0);
991    END_BATCH();
992
993    BEGIN_BATCH(80);
994    if (info->ChipFamily < CHIP_FAMILY_RV770) {
995	EREG(ib, TA_CNTL_AUX, (( 3 << GRADIENT_CREDIT_shift) |
996			       (28 << TD_FIFO_CREDIT_shift)));
997	EREG(ib, VC_ENHANCE, 0);
998	EREG(ib, R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, 0);
999	EREG(ib, DB_DEBUG, 0x82000000); /* ? */
1000	EREG(ib, DB_WATERMARKS, ((4 << DEPTH_FREE_shift) |
1001				 (16 << DEPTH_FLUSH_shift) |
1002				 (0 << FORCE_SUMMARIZE_shift) |
1003				 (4 << DEPTH_PENDING_FREE_shift) |
1004				 (16 << DEPTH_CACHELINE_FREE_shift) |
1005				 0));
1006    } else {
1007	EREG(ib, TA_CNTL_AUX, (( 2 << GRADIENT_CREDIT_shift) |
1008			       (28 << TD_FIFO_CREDIT_shift)));
1009	EREG(ib, VC_ENHANCE, 0);
1010	EREG(ib, R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, VS_PC_LIMIT_ENABLE_bit);
1011	EREG(ib, DB_DEBUG, 0);
1012	EREG(ib, DB_WATERMARKS, ((4 << DEPTH_FREE_shift) |
1013				 (16 << DEPTH_FLUSH_shift) |
1014				 (0 << FORCE_SUMMARIZE_shift) |
1015				 (4 << DEPTH_PENDING_FREE_shift) |
1016				 (4 << DEPTH_CACHELINE_FREE_shift) |
1017				 0));
1018    }
1019
1020    PACK0(ib, SQ_VTX_BASE_VTX_LOC, 2);
1021    E32(ib, 0);
1022    E32(ib, 0);
1023
1024    PACK0(ib, SQ_ESGS_RING_ITEMSIZE, 9);
1025    E32(ib, 0); // SQ_ESGS_RING_ITEMSIZE
1026    E32(ib, 0); // SQ_GSVS_RING_ITEMSIZE
1027    E32(ib, 0); // SQ_ESTMP_RING_ITEMSIZE
1028    E32(ib, 0); // SQ_GSTMP_RING_ITEMSIZE
1029    E32(ib, 0); // SQ_VSTMP_RING_ITEMSIZE
1030    E32(ib, 0); // SQ_PSTMP_RING_ITEMSIZE
1031    E32(ib, 0); // SQ_FBUF_RING_ITEMSIZE
1032    E32(ib, 0); // SQ_REDUC_RING_ITEMSIZE
1033    E32(ib, 0); // SQ_GS_VERT_ITEMSIZE
1034
1035    // DB
1036    EREG(ib, DB_DEPTH_CONTROL,                    0);
1037    PACK0(ib, DB_RENDER_CONTROL, 2);
1038    E32(ib, STENCIL_COMPRESS_DISABLE_bit | DEPTH_COMPRESS_DISABLE_bit);
1039    if (info->ChipFamily < CHIP_FAMILY_RV770)
1040	E32(ib, FORCE_SHADER_Z_ORDER_bit);
1041    else
1042	E32(ib, 0);
1043    EREG(ib, DB_ALPHA_TO_MASK,                    ((2 << ALPHA_TO_MASK_OFFSET0_shift)	|
1044						   (2 << ALPHA_TO_MASK_OFFSET1_shift)	|
1045						   (2 << ALPHA_TO_MASK_OFFSET2_shift)	|
1046						   (2 << ALPHA_TO_MASK_OFFSET3_shift)));
1047    EREG(ib, DB_SHADER_CONTROL, ((1 << Z_ORDER_shift) | /* EARLY_Z_THEN_LATE_Z */
1048				 DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
1049
1050    PACK0(ib, DB_STENCIL_CLEAR, 2);
1051    E32(ib, 0); // DB_STENCIL_CLEAR
1052    E32(ib, 0); // DB_DEPTH_CLEAR
1053
1054    PACK0(ib, DB_STENCILREFMASK, 3);
1055    E32(ib, 0); // DB_STENCILREFMASK
1056    E32(ib, 0); // DB_STENCILREFMASK_BF
1057    E32(ib, 0); // SX_ALPHA_REF
1058
1059    PACK0(ib, CB_CLRCMP_CONTROL, 4);
1060    E32(ib, 1 << CLRCMP_FCN_SEL_shift);				// CB_CLRCMP_CONTROL: use CLRCMP_FCN_SRC
1061    E32(ib, 0);							// CB_CLRCMP_SRC
1062    E32(ib, 0);							// CB_CLRCMP_DST
1063    E32(ib, 0);							// CB_CLRCMP_MSK
1064
1065    EREG(ib, CB_SHADER_MASK,                      OUTPUT0_ENABLE_mask);
1066    EREG(ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
1067
1068    PACK0(ib, SX_ALPHA_TEST_CONTROL, 5);
1069    E32(ib, 0); // SX_ALPHA_TEST_CONTROL
1070    E32(ib, 0x00000000); // CB_BLEND_RED
1071    E32(ib, 0x00000000); // CB_BLEND_GREEN
1072    E32(ib, 0x00000000); // CB_BLEND_BLUE
1073    E32(ib, 0x00000000); // CB_BLEND_ALPHA
1074
1075    EREG(ib, PA_SC_WINDOW_OFFSET,                 ((0 << WINDOW_X_OFFSET_shift) |
1076						   (0 << WINDOW_Y_OFFSET_shift)));
1077
1078    if (info->ChipFamily < CHIP_FAMILY_RV770)
1079	EREG(ib, R7xx_PA_SC_EDGERULE,             0x00000000);
1080    else
1081	EREG(ib, R7xx_PA_SC_EDGERULE,             0xAAAAAAAA);
1082
1083    EREG(ib, PA_SC_CLIPRECT_RULE,                 CLIP_RULE_mask);
1084
1085    END_BATCH();
1086
1087    /* clip boolean is set to always visible -> doesn't matter */
1088    for (i = 0; i < PA_SC_CLIPRECT_0_TL_num; i++)
1089	r600_set_clip_rect(pScrn, ib, i, 0, 0, 8192, 8192);
1090
1091    for (i = 0; i < PA_SC_VPORT_SCISSOR_0_TL_num; i++)
1092	r600_set_vport_scissor(pScrn, ib, i, 0, 0, 8192, 8192);
1093
1094    BEGIN_BATCH(49);
1095    PACK0(ib, PA_SC_MPASS_PS_CNTL, 2);
1096    E32(ib, 0);
1097    if (info->ChipFamily < CHIP_FAMILY_RV770)
1098	E32(ib, (WALK_ORDER_ENABLE_bit | FORCE_EOV_CNTDWN_ENABLE_bit));
1099    else
1100	E32(ib, (FORCE_EOV_CNTDWN_ENABLE_bit | FORCE_EOV_REZ_ENABLE_bit |
1101		 0x00500000)); /* ? */
1102
1103    PACK0(ib, PA_SC_LINE_CNTL, 9);
1104    E32(ib, 0); // PA_SC_LINE_CNTL
1105    E32(ib, 0); // PA_SC_AA_CONFIG
1106    E32(ib, ((2 << PA_SU_VTX_CNTL__ROUND_MODE_shift) | PIX_CENTER_bit | // PA_SU_VTX_CNTL
1107	     (5 << QUANT_MODE_shift))); /* Round to Even, fixed point 1/256 */
1108    EFLOAT(ib, 1.0);						// PA_CL_GB_VERT_CLIP_ADJ
1109    EFLOAT(ib, 1.0);						// PA_CL_GB_VERT_DISC_ADJ
1110    EFLOAT(ib, 1.0);						// PA_CL_GB_HORZ_CLIP_ADJ
1111    EFLOAT(ib, 1.0);						// PA_CL_GB_HORZ_DISC_ADJ
1112    E32(ib, 0);                                                 // PA_SC_AA_SAMPLE_LOCS_MCTX
1113    E32(ib, 0);                                                 // PA_SC_AA_SAMPLE_LOCS_8S_WD1_M
1114
1115    EREG(ib, PA_SC_AA_MASK,                       0xFFFFFFFF);
1116
1117    PACK0(ib, PA_CL_CLIP_CNTL, 5);
1118    E32(ib, CLIP_DISABLE_bit); // PA_CL_CLIP_CNTL
1119    E32(ib, FACE_bit);         // PA_SU_SC_MODE_CNTL
1120    E32(ib, VTX_XY_FMT_bit);   // PA_CL_VTE_CNTL
1121    E32(ib, 0);                // PA_CL_VS_OUT_CNTL
1122    E32(ib, 0);                // PA_CL_NANINF_CNTL
1123
1124    PACK0(ib, PA_SU_POLY_OFFSET_DB_FMT_CNTL, 6);
1125    E32(ib, 0); // PA_SU_POLY_OFFSET_DB_FMT_CNTL
1126    E32(ib, 0); // PA_SU_POLY_OFFSET_CLAMP
1127    E32(ib, 0); // PA_SU_POLY_OFFSET_FRONT_SCALE
1128    E32(ib, 0); // PA_SU_POLY_OFFSET_FRONT_OFFSET
1129    E32(ib, 0); // PA_SU_POLY_OFFSET_BACK_SCALE
1130    E32(ib, 0); // PA_SU_POLY_OFFSET_BACK_OFFSET
1131
1132    // SPI
1133    if (info->ChipFamily < CHIP_FAMILY_RV770)
1134	EREG(ib, R7xx_SPI_THREAD_GROUPING,        0);
1135    else
1136	EREG(ib, R7xx_SPI_THREAD_GROUPING,        (1 << PS_GROUPING_shift));
1137
1138    /* default Interpolator setup */
1139    EREG(ib, SPI_VS_OUT_ID_0, ((0 << SEMANTIC_0_shift) |
1140			       (1 << SEMANTIC_1_shift)));
1141    PACK0(ib, SPI_PS_INPUT_CNTL_0 + (0 << 2), 2);
1142    /* SPI_PS_INPUT_CNTL_0 maps to GPR[0] - load with semantic id 0 */
1143    E32(ib, ((0    << SEMANTIC_shift)	|
1144	     (0x01 << DEFAULT_VAL_shift)	|
1145	     SEL_CENTROID_bit));
1146    /* SPI_PS_INPUT_CNTL_1 maps to GPR[1] - load with semantic id 1 */
1147    E32(ib, ((1    << SEMANTIC_shift)	|
1148	     (0x01 << DEFAULT_VAL_shift)	|
1149	     SEL_CENTROID_bit));
1150
1151    PACK0(ib, SPI_INPUT_Z, 4);
1152    E32(ib, 0); // SPI_INPUT_Z
1153    E32(ib, 0); // SPI_FOG_CNTL
1154    E32(ib, 0); // SPI_FOG_FUNC_SCALE
1155    E32(ib, 0); // SPI_FOG_FUNC_BIAS
1156
1157    END_BATCH();
1158
1159    // clear FS
1160    fs_conf.bo = accel_state->shaders_bo;
1161    r600_fs_setup(pScrn, ib, &fs_conf, RADEON_GEM_DOMAIN_VRAM);
1162
1163    // VGT
1164    BEGIN_BATCH(46);
1165    PACK0(ib, VGT_MAX_VTX_INDX, 4);
1166    E32(ib, 0xffffff); // VGT_MAX_VTX_INDX
1167    E32(ib, 0); // VGT_MIN_VTX_INDX
1168    E32(ib, 0); // VGT_INDX_OFFSET
1169    E32(ib, 0); // VGT_MULTI_PRIM_IB_RESET_INDX
1170
1171    EREG(ib, VGT_PRIMITIVEID_EN,                  0);
1172    EREG(ib, VGT_MULTI_PRIM_IB_RESET_EN,          0);
1173
1174    PACK0(ib, VGT_INSTANCE_STEP_RATE_0, 2);
1175    E32(ib, 0); // VGT_INSTANCE_STEP_RATE_0
1176    E32(ib, 0); // VGT_INSTANCE_STEP_RATE_1
1177
1178    PACK0(ib, PA_SU_POINT_SIZE, 17);
1179    E32(ib, 0); // PA_SU_POINT_SIZE
1180    E32(ib, 0); // PA_SU_POINT_MINMAX
1181    E32(ib, (8 << PA_SU_LINE_CNTL__WIDTH_shift)); /* Line width 1 pixel */ // PA_SU_LINE_CNTL
1182    E32(ib, 0); // PA_SC_LINE_STIPPLE
1183    E32(ib, 0); // VGT_OUTPUT_PATH_CNTL
1184    E32(ib, 0); // VGT_HOS_CNTL
1185    E32(ib, 0); // VGT_HOS_MAX_TESS_LEVEL
1186    E32(ib, 0); // VGT_HOS_MIN_TESS_LEVEL
1187    E32(ib, 0); // VGT_HOS_REUSE_DEPTH
1188    E32(ib, 0); // VGT_GROUP_PRIM_TYPE
1189    E32(ib, 0); // VGT_GROUP_FIRST_DECR
1190    E32(ib, 0); // VGT_GROUP_DECR
1191    E32(ib, 0); // VGT_GROUP_VECT_0_CNTL
1192    E32(ib, 0); // VGT_GROUP_VECT_1_CNTL
1193    E32(ib, 0); // VGT_GROUP_VECT_0_FMT_CNTL
1194    E32(ib, 0); // VGT_GROUP_VECT_1_FMT_CNTL
1195    E32(ib, 0); // VGT_GS_MODE
1196
1197    PACK0(ib, VGT_STRMOUT_EN, 3);
1198    E32(ib, 0); // VGT_STRMOUT_EN
1199    E32(ib, 0); // VGT_REUSE_OFF
1200    E32(ib, 0); // VGT_VTX_CNT_EN
1201
1202    EREG(ib, VGT_STRMOUT_BUFFER_EN,               0);
1203    EREG(ib, SX_MISC,                             0);
1204    END_BATCH();
1205}
1206
1207
1208/*
1209 * Commands
1210 */
1211
1212void
1213r600_draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *indices)
1214{
1215    RADEONInfoPtr info = RADEONPTR(pScrn);
1216    uint32_t i, count;
1217
1218    // calculate the size of the DRAW_INDEX_IMMD payload in dwords
1219    count = 2;
1220    if (draw_conf->index_type == DI_INDEX_SIZE_16_BIT)
1221	count += (draw_conf->num_indices + 1) / 2;
1222    else
1223	count += draw_conf->num_indices;
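    /* e.g. 5 16-bit indices pack two per dword into 3 dwords, so the
     * DRAW_INDEX_IMMD body is 2 + 3 = 5 dwords long.
     */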
1224
1225    BEGIN_BATCH(8 + count);
1226    EREG(ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
1227    PACK3(ib, IT_INDEX_TYPE, 1);
1228#if X_BYTE_ORDER == X_BIG_ENDIAN
1229    E32(ib, IT_INDEX_TYPE_SWAP_MODE(ENDIAN_8IN32) | draw_conf->index_type);
1230#else
1231    E32(ib, draw_conf->index_type);
1232#endif
1233    PACK3(ib, IT_NUM_INSTANCES, 1);
1234    E32(ib, draw_conf->num_instances);
1235
1236    PACK3(ib, IT_DRAW_INDEX_IMMD, count);
1237    E32(ib, draw_conf->num_indices);
1238    E32(ib, draw_conf->vgt_draw_initiator);
1239
1240    if (draw_conf->index_type == DI_INDEX_SIZE_16_BIT) {
1241	for (i = 0; i < draw_conf->num_indices; i += 2) {
1242	    if ((i + 1) == draw_conf->num_indices)
1243		E32(ib, indices[i]);
1244	    else
1245		E32(ib, (indices[i] | (indices[i + 1] << 16)));
1246	}
1247    } else {
1248	for (i = 0; i < draw_conf->num_indices; i++)
1249	    E32(ib, indices[i]);
1250    }
1251    END_BATCH();
1252}
1253
1254void
1255r600_draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf)
1256{
1257    RADEONInfoPtr info = RADEONPTR(pScrn);
1258
1259    BEGIN_BATCH(10);
1260    EREG(ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
1261    PACK3(ib, IT_INDEX_TYPE, 1);
1262#if X_BYTE_ORDER == X_BIG_ENDIAN
1263    E32(ib, IT_INDEX_TYPE_SWAP_MODE(ENDIAN_8IN32) | draw_conf->index_type);
1264#else
1265    E32(ib, draw_conf->index_type);
1266#endif
1267    PACK3(ib, IT_NUM_INSTANCES, 1);
1268    E32(ib, draw_conf->num_instances);
1269    PACK3(ib, IT_DRAW_INDEX_AUTO, 2);
1270    E32(ib, draw_conf->num_indices);
1271    E32(ib, draw_conf->vgt_draw_initiator);
1272    END_BATCH();
1273}
1274
1275void r600_finish_op(ScrnInfoPtr pScrn, int vtx_size)
1276{
1277    RADEONInfoPtr info = RADEONPTR(pScrn);
1278    struct radeon_accel_state *accel_state = info->accel_state;
1279    draw_config_t   draw_conf;
1280    vtx_resource_t  vtx_res;
1281
1282    if (accel_state->vbo.vb_start_op == -1)
1283	return;
1284
1285    CLEAR (draw_conf);
1286    CLEAR (vtx_res);
1287
1288    if (accel_state->vbo.vb_offset == accel_state->vbo.vb_start_op) {
1289        R600IBDiscard(pScrn, accel_state->ib);
1290	return;
1291    }
1292
1293    /* Vertex buffer setup */
1294    accel_state->vbo.vb_size = accel_state->vbo.vb_offset - accel_state->vbo.vb_start_op;
1295    vtx_res.id              = SQ_VTX_RESOURCE_vs;
1296    vtx_res.vtx_size_dw     = vtx_size / 4;
1297    vtx_res.vtx_num_entries = accel_state->vbo.vb_size / 4;
1298    vtx_res.mem_req_size    = 1;
1299    vtx_res.vb_addr         = accel_state->vbo.vb_mc_addr + accel_state->vbo.vb_start_op;
1300    vtx_res.bo              = accel_state->vbo.vb_bo;
1301#if X_BYTE_ORDER == X_BIG_ENDIAN
1302    vtx_res.endian          = SQ_ENDIAN_8IN32;
1303#endif
1304    r600_set_vtx_resource(pScrn, accel_state->ib, &vtx_res, RADEON_GEM_DOMAIN_GTT);
1305
1306    /* Draw */
1307    draw_conf.prim_type          = DI_PT_RECTLIST;
1308    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
1309    draw_conf.num_instances      = 1;
1310    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
1311    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
1312
1313    r600_draw_auto(pScrn, accel_state->ib, &draw_conf);
1314
1315    /* XXX drm should handle this in fence submit */
1316    r600_wait_3d_idle_clean(pScrn, accel_state->ib);
1317
1318    /* sync dst surface */
1319    r600_cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
1320			     accel_state->dst_size, accel_state->dst_obj.offset,
1321			     accel_state->dst_obj.bo, 0, accel_state->dst_obj.domain);
1322
1323    accel_state->vbo.vb_start_op = -1;
1324    accel_state->ib_reset_op = 0;
1325
1326#if KMS_MULTI_OP
1327    if (!info->cs)
1328#endif
1329	R600CPFlushIndirect(pScrn, accel_state->ib);
1330}
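/* A rough sketch (not compiled) of how the helpers in this file chain
 * together for one accelerated operation; the real sequencing lives in
 * callers such as r600_exa.c.  cb_conf, vs_conf, ps_conf, dst_w, dst_h,
 * vs_export_count, num_interp and vtx_size are placeholders for state the
 * caller would fill in, as in the cb_config_t sketch earlier in this file.
 */
#if 0
    r600_set_default_state(pScrn, accel_state->ib);

    r600_set_generic_scissor(pScrn, accel_state->ib, 0, 0, dst_w, dst_h);
    r600_set_screen_scissor(pScrn, accel_state->ib, 0, 0, dst_w, dst_h);
    r600_set_window_scissor(pScrn, accel_state->ib, 0, 0, dst_w, dst_h);

    r600_set_render_target(pScrn, accel_state->ib, &cb_conf,
			   accel_state->dst_obj.domain);

    r600_vs_setup(pScrn, accel_state->ib, &vs_conf, RADEON_GEM_DOMAIN_VRAM);
    r600_ps_setup(pScrn, accel_state->ib, &ps_conf, RADEON_GEM_DOMAIN_VRAM);
    r600_set_spi(pScrn, accel_state->ib, vs_export_count, num_interp);

    /* ...queue one RECTLIST worth of vertices into accel_state->vbo... */

    r600_finish_op(pScrn, vtx_size);  /* emits vertex resource, draw, dst sync */
#endif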
1331
1332