1848b8605Smrg/*
2848b8605Smrg * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3b8e80941Smrg * Copyright 2018 Advanced Micro Devices, Inc.
4b8e80941Smrg * All Rights Reserved.
5848b8605Smrg *
6848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
7848b8605Smrg * copy of this software and associated documentation files (the "Software"),
8848b8605Smrg * to deal in the Software without restriction, including without limitation
9848b8605Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub
10848b8605Smrg * license, and/or sell copies of the Software, and to permit persons to whom
11848b8605Smrg * the Software is furnished to do so, subject to the following conditions:
12848b8605Smrg *
13848b8605Smrg * The above copyright notice and this permission notice (including the next
14848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the
15848b8605Smrg * Software.
16848b8605Smrg *
17848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
20848b8605Smrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21848b8605Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22848b8605Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23848b8605Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
24848b8605Smrg */
25848b8605Smrg
26848b8605Smrg#include "si_pipe.h"
27848b8605Smrg#include "si_public.h"
28b8e80941Smrg#include "si_shader_internal.h"
29b8e80941Smrg#include "si_compute.h"
30848b8605Smrg#include "sid.h"
31848b8605Smrg
32b8e80941Smrg#include "ac_llvm_util.h"
33848b8605Smrg#include "radeon/radeon_uvd.h"
34b8e80941Smrg#include "gallivm/lp_bld_misc.h"
35b8e80941Smrg#include "util/disk_cache.h"
36b8e80941Smrg#include "util/u_log.h"
37848b8605Smrg#include "util/u_memory.h"
38b8e80941Smrg#include "util/u_suballoc.h"
39b8e80941Smrg#include "util/u_tests.h"
40b8e80941Smrg#include "util/u_upload_mgr.h"
41b8e80941Smrg#include "util/xmlconfig.h"
42848b8605Smrg#include "vl/vl_decoder.h"
43b8e80941Smrg#include "driver_ddebug/dd_util.h"
44b8e80941Smrg
45b8e80941Smrgstatic const struct debug_named_value debug_options[] = {
46b8e80941Smrg	/* Shader logging options: */
47b8e80941Smrg	{ "vs", DBG(VS), "Print vertex shaders" },
48b8e80941Smrg	{ "ps", DBG(PS), "Print pixel shaders" },
49b8e80941Smrg	{ "gs", DBG(GS), "Print geometry shaders" },
50b8e80941Smrg	{ "tcs", DBG(TCS), "Print tessellation control shaders" },
51b8e80941Smrg	{ "tes", DBG(TES), "Print tessellation evaluation shaders" },
52b8e80941Smrg	{ "cs", DBG(CS), "Print compute shaders" },
53b8e80941Smrg	{ "noir", DBG(NO_IR), "Don't print the LLVM IR"},
54b8e80941Smrg	{ "notgsi", DBG(NO_TGSI), "Don't print the TGSI"},
55b8e80941Smrg	{ "noasm", DBG(NO_ASM), "Don't print disassembled shaders"},
56b8e80941Smrg	{ "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" },
57b8e80941Smrg
58b8e80941Smrg	/* Shader compiler options the shader cache should be aware of: */
59b8e80941Smrg	{ "unsafemath", DBG(UNSAFE_MATH), "Enable unsafe math shader optimizations" },
60b8e80941Smrg	{ "sisched", DBG(SI_SCHED), "Enable LLVM SI Machine Instruction Scheduler." },
61b8e80941Smrg	{ "gisel", DBG(GISEL), "Enable LLVM global instruction selector." },
62b8e80941Smrg
63b8e80941Smrg	/* Shader compiler options (with no effect on the shader cache): */
64b8e80941Smrg	{ "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
65b8e80941Smrg	{ "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" },
66b8e80941Smrg	{ "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." },
67b8e80941Smrg
68b8e80941Smrg	/* Information logging options: */
69b8e80941Smrg	{ "info", DBG(INFO), "Print driver information" },
70b8e80941Smrg	{ "tex", DBG(TEX), "Print texture info" },
71b8e80941Smrg	{ "compute", DBG(COMPUTE), "Print compute info" },
72b8e80941Smrg	{ "vm", DBG(VM), "Print virtual addresses when creating resources" },
73b8e80941Smrg
74b8e80941Smrg	/* Driver options: */
75b8e80941Smrg	{ "forcedma", DBG(FORCE_DMA), "Use asynchronous DMA for all operations when possible." },
76b8e80941Smrg	{ "nodma", DBG(NO_ASYNC_DMA), "Disable asynchronous DMA" },
77b8e80941Smrg	{ "nowc", DBG(NO_WC), "Disable GTT write combining" },
78b8e80941Smrg	{ "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." },
79b8e80941Smrg	{ "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." },
80b8e80941Smrg	{ "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },
81b8e80941Smrg
82b8e80941Smrg	/* 3D engine options: */
83b8e80941Smrg	{ "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
84b8e80941Smrg	{ "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" },
85b8e80941Smrg	{ "nodpbb", DBG(NO_DPBB), "Disable DPBB." },
86b8e80941Smrg	{ "nodfsm", DBG(NO_DFSM), "Disable DFSM." },
87b8e80941Smrg	{ "dpbb", DBG(DPBB), "Enable DPBB." },
88b8e80941Smrg	{ "dfsm", DBG(DFSM), "Enable DFSM." },
89b8e80941Smrg	{ "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" },
90b8e80941Smrg	{ "norbplus", DBG(NO_RB_PLUS), "Disable RB+." },
91b8e80941Smrg	{ "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" },
92b8e80941Smrg	{ "notiling", DBG(NO_TILING), "Disable tiling" },
93b8e80941Smrg	{ "nodcc", DBG(NO_DCC), "Disable DCC." },
94b8e80941Smrg	{ "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
95b8e80941Smrg	{ "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" },
96b8e80941Smrg	{ "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" },
97b8e80941Smrg	{ "nofmask", DBG(NO_FMASK), "Disable MSAA compression" },
98b8e80941Smrg
99b8e80941Smrg	/* Tests: */
100b8e80941Smrg	{ "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." },
101b8e80941Smrg	{ "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." },
102b8e80941Smrg	{ "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." },
103b8e80941Smrg	{ "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." },
104b8e80941Smrg	{ "testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance" },
105b8e80941Smrg	{ "testgds", DBG(TEST_GDS), "Test GDS." },
106b8e80941Smrg	{ "testgdsmm", DBG(TEST_GDS_MM), "Test GDS memory management." },
107b8e80941Smrg	{ "testgdsoamm", DBG(TEST_GDS_OA_MM), "Test GDS OA memory management." },
108b8e80941Smrg
109b8e80941Smrg	DEBUG_NAMED_VALUE_END /* must be last */
110b8e80941Smrg};
111b8e80941Smrg
112b8e80941Smrgstatic void si_init_compiler(struct si_screen *sscreen,
113b8e80941Smrg			     struct ac_llvm_compiler *compiler)
114b8e80941Smrg{
115b8e80941Smrg	/* Only create the less-optimizing version of the compiler on APUs
116b8e80941Smrg	 * predating Ryzen (Raven). */
117b8e80941Smrg	bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram &&
118b8e80941Smrg				       sscreen->info.chip_class <= VI;
119b8e80941Smrg
120b8e80941Smrg	enum ac_target_machine_options tm_options =
121b8e80941Smrg		(sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) |
122b8e80941Smrg		(sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
123ac028361Smrg		(sscreen->info.chip_class >= VI ? AC_TM_FORCE_ENABLE_XNACK : 0) |
124ac028361Smrg		(sscreen->info.chip_class < VI ? AC_TM_FORCE_DISABLE_XNACK : 0) |
125b8e80941Smrg		(!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
126b8e80941Smrg		(sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
127b8e80941Smrg		(create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
128b8e80941Smrg
129b8e80941Smrg	ac_init_llvm_once();
130b8e80941Smrg	ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
131b8e80941Smrg	compiler->passes = ac_create_llvm_passes(compiler->tm);
132b8e80941Smrg
133b8e80941Smrg	if (compiler->low_opt_tm)
134b8e80941Smrg		compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
135b8e80941Smrg}
136b8e80941Smrg
137b8e80941Smrgstatic void si_destroy_compiler(struct ac_llvm_compiler *compiler)
138b8e80941Smrg{
139b8e80941Smrg	ac_destroy_llvm_passes(compiler->passes);
140b8e80941Smrg	ac_destroy_llvm_passes(compiler->low_opt_passes);
141b8e80941Smrg	ac_destroy_llvm_compiler(compiler);
142b8e80941Smrg}
143848b8605Smrg
144848b8605Smrg/*
145848b8605Smrg * pipe_context
146848b8605Smrg */
147848b8605Smrgstatic void si_destroy_context(struct pipe_context *context)
148848b8605Smrg{
149848b8605Smrg	struct si_context *sctx = (struct si_context *)context;
150b8e80941Smrg	int i;
151b8e80941Smrg
152b8e80941Smrg	util_queue_finish(&sctx->screen->shader_compiler_queue);
153b8e80941Smrg	util_queue_finish(&sctx->screen->shader_compiler_queue_low_priority);
154b8e80941Smrg
155b8e80941Smrg	/* Unreference the framebuffer normally to disable related logic
156b8e80941Smrg	 * properly.
157b8e80941Smrg	 */
158b8e80941Smrg	struct pipe_framebuffer_state fb = {};
159b8e80941Smrg	if (context->set_framebuffer_state)
160b8e80941Smrg		context->set_framebuffer_state(context, &fb);
161848b8605Smrg
162848b8605Smrg	si_release_all_descriptors(sctx);
163848b8605Smrg
164b8e80941Smrg	pipe_resource_reference(&sctx->esgs_ring, NULL);
165b8e80941Smrg	pipe_resource_reference(&sctx->gsvs_ring, NULL);
166b8e80941Smrg	pipe_resource_reference(&sctx->tess_rings, NULL);
167848b8605Smrg	pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
168b8e80941Smrg	pipe_resource_reference(&sctx->sample_pos_buffer, NULL);
169b8e80941Smrg	si_resource_reference(&sctx->border_color_buffer, NULL);
170b8e80941Smrg	free(sctx->border_color_table);
171b8e80941Smrg	si_resource_reference(&sctx->scratch_buffer, NULL);
172b8e80941Smrg	si_resource_reference(&sctx->compute_scratch_buffer, NULL);
173b8e80941Smrg	si_resource_reference(&sctx->wait_mem_scratch, NULL);
174b8e80941Smrg
175b8e80941Smrg	si_pm4_free_state(sctx, sctx->init_config, ~0);
176b8e80941Smrg	if (sctx->init_config_gs_rings)
177b8e80941Smrg		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
178b8e80941Smrg	for (i = 0; i < ARRAY_SIZE(sctx->vgt_shader_config); i++)
179b8e80941Smrg		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
180b8e80941Smrg
181b8e80941Smrg	if (sctx->fixed_func_tcs_shader.cso)
182b8e80941Smrg		sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso);
183b8e80941Smrg	if (sctx->custom_dsa_flush)
184b8e80941Smrg		sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush);
185b8e80941Smrg	if (sctx->custom_blend_resolve)
186b8e80941Smrg		sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve);
187b8e80941Smrg	if (sctx->custom_blend_fmask_decompress)
188b8e80941Smrg		sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress);
189b8e80941Smrg	if (sctx->custom_blend_eliminate_fastclear)
190b8e80941Smrg		sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear);
191b8e80941Smrg	if (sctx->custom_blend_dcc_decompress)
192b8e80941Smrg		sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress);
193b8e80941Smrg	if (sctx->vs_blit_pos)
194b8e80941Smrg		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos);
195b8e80941Smrg	if (sctx->vs_blit_pos_layered)
196b8e80941Smrg		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered);
197b8e80941Smrg	if (sctx->vs_blit_color)
198b8e80941Smrg		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
199b8e80941Smrg	if (sctx->vs_blit_color_layered)
200b8e80941Smrg		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
201b8e80941Smrg	if (sctx->vs_blit_texcoord)
202b8e80941Smrg		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
203b8e80941Smrg	if (sctx->cs_clear_buffer)
204b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
205b8e80941Smrg	if (sctx->cs_copy_buffer)
206b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
207b8e80941Smrg	if (sctx->cs_copy_image)
208b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
209b8e80941Smrg	if (sctx->cs_copy_image_1d_array)
210b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
211b8e80941Smrg	if (sctx->cs_clear_render_target)
212b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
213b8e80941Smrg	if (sctx->cs_clear_render_target_1d_array)
214b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
215b8e80941Smrg	if (sctx->cs_dcc_retile)
216b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile);
217b8e80941Smrg
218b8e80941Smrg	if (sctx->blitter)
219b8e80941Smrg		util_blitter_destroy(sctx->blitter);
220b8e80941Smrg
221b8e80941Smrg	/* Release DCC stats. */
222b8e80941Smrg	for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
223b8e80941Smrg		assert(!sctx->dcc_stats[i].query_active);
224b8e80941Smrg
225b8e80941Smrg		for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
226b8e80941Smrg			if (sctx->dcc_stats[i].ps_stats[j])
227b8e80941Smrg				sctx->b.destroy_query(&sctx->b,
228b8e80941Smrg							sctx->dcc_stats[i].ps_stats[j]);
229b8e80941Smrg
230b8e80941Smrg		si_texture_reference(&sctx->dcc_stats[i].tex, NULL);
231b8e80941Smrg	}
232848b8605Smrg
233b8e80941Smrg	if (sctx->query_result_shader)
234b8e80941Smrg		sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader);
235848b8605Smrg
236b8e80941Smrg	if (sctx->gfx_cs)
237b8e80941Smrg		sctx->ws->cs_destroy(sctx->gfx_cs);
238b8e80941Smrg	if (sctx->dma_cs)
239b8e80941Smrg		sctx->ws->cs_destroy(sctx->dma_cs);
240b8e80941Smrg	if (sctx->ctx)
241b8e80941Smrg		sctx->ws->ctx_destroy(sctx->ctx);
242b8e80941Smrg
243b8e80941Smrg	if (sctx->b.stream_uploader)
244b8e80941Smrg		u_upload_destroy(sctx->b.stream_uploader);
245b8e80941Smrg	if (sctx->b.const_uploader)
246b8e80941Smrg		u_upload_destroy(sctx->b.const_uploader);
247b8e80941Smrg	if (sctx->cached_gtt_allocator)
248b8e80941Smrg		u_upload_destroy(sctx->cached_gtt_allocator);
249b8e80941Smrg
250b8e80941Smrg	slab_destroy_child(&sctx->pool_transfers);
251b8e80941Smrg	slab_destroy_child(&sctx->pool_transfers_unsync);
252b8e80941Smrg
253b8e80941Smrg	if (sctx->allocator_zeroed_memory)
254b8e80941Smrg		u_suballocator_destroy(sctx->allocator_zeroed_memory);
255b8e80941Smrg
256b8e80941Smrg	sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
257b8e80941Smrg	sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
258b8e80941Smrg	si_resource_reference(&sctx->eop_bug_scratch, NULL);
259848b8605Smrg
260b8e80941Smrg	si_destroy_compiler(&sctx->compiler);
261848b8605Smrg
262b8e80941Smrg	si_saved_cs_reference(&sctx->current_saved_cs, NULL);
263848b8605Smrg
264b8e80941Smrg	_mesa_hash_table_destroy(sctx->tex_handles, NULL);
265b8e80941Smrg	_mesa_hash_table_destroy(sctx->img_handles, NULL);
266b8e80941Smrg
267b8e80941Smrg	util_dynarray_fini(&sctx->resident_tex_handles);
268b8e80941Smrg	util_dynarray_fini(&sctx->resident_img_handles);
269b8e80941Smrg	util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
270b8e80941Smrg	util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
271b8e80941Smrg	util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
272b8e80941Smrg	si_unref_sdma_uploads(sctx);
273848b8605Smrg	FREE(sctx);
274848b8605Smrg}
275848b8605Smrg
276b8e80941Smrgstatic enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
277b8e80941Smrg{
278b8e80941Smrg	struct si_context *sctx = (struct si_context *)ctx;
279b8e80941Smrg
280b8e80941Smrg	if (sctx->screen->info.has_gpu_reset_status_query)
281b8e80941Smrg		return sctx->ws->ctx_query_reset_status(sctx->ctx);
282b8e80941Smrg
283b8e80941Smrg	if (sctx->screen->info.has_gpu_reset_counter_query) {
284b8e80941Smrg		unsigned latest = sctx->ws->query_value(sctx->ws,
285b8e80941Smrg							RADEON_GPU_RESET_COUNTER);
286b8e80941Smrg
287b8e80941Smrg		if (sctx->gpu_reset_counter == latest)
288b8e80941Smrg			return PIPE_NO_RESET;
289b8e80941Smrg
290b8e80941Smrg		sctx->gpu_reset_counter = latest;
291b8e80941Smrg		return PIPE_UNKNOWN_CONTEXT_RESET;
292b8e80941Smrg	}
293b8e80941Smrg
294b8e80941Smrg	return PIPE_NO_RESET;
295b8e80941Smrg}
296b8e80941Smrg
297b8e80941Smrgstatic void si_set_device_reset_callback(struct pipe_context *ctx,
298b8e80941Smrg					   const struct pipe_device_reset_callback *cb)
299b8e80941Smrg{
300b8e80941Smrg	struct si_context *sctx = (struct si_context *)ctx;
301b8e80941Smrg
302b8e80941Smrg	if (cb)
303b8e80941Smrg		sctx->device_reset_callback = *cb;
304b8e80941Smrg	else
305b8e80941Smrg		memset(&sctx->device_reset_callback, 0,
306b8e80941Smrg		       sizeof(sctx->device_reset_callback));
307b8e80941Smrg}
308b8e80941Smrg
309b8e80941Smrgbool si_check_device_reset(struct si_context *sctx)
310b8e80941Smrg{
311b8e80941Smrg	enum pipe_reset_status status;
312b8e80941Smrg
313b8e80941Smrg	if (!sctx->device_reset_callback.reset)
314b8e80941Smrg		return false;
315b8e80941Smrg
316b8e80941Smrg	if (!sctx->b.get_device_reset_status)
317b8e80941Smrg		return false;
318b8e80941Smrg
319b8e80941Smrg	status = sctx->b.get_device_reset_status(&sctx->b);
320b8e80941Smrg	if (status == PIPE_NO_RESET)
321b8e80941Smrg		return false;
322b8e80941Smrg
323b8e80941Smrg	sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status);
324b8e80941Smrg	return true;
325b8e80941Smrg}
326b8e80941Smrg
327b8e80941Smrg/* Apitrace profiling:
328b8e80941Smrg *   1) qapitrace : Tools -> Profile: Measure CPU & GPU times
329b8e80941Smrg *   2) In the middle panel, zoom in (mouse wheel) on some bad draw call
330b8e80941Smrg *      and remember its number.
331b8e80941Smrg *   3) In Mesa, enable queries and performance counters around that draw
332b8e80941Smrg *      call and print the results.
333b8e80941Smrg *   4) glretrace --benchmark --markers ..
334b8e80941Smrg */
335b8e80941Smrgstatic void si_emit_string_marker(struct pipe_context *ctx,
336b8e80941Smrg				  const char *string, int len)
337b8e80941Smrg{
338b8e80941Smrg	struct si_context *sctx = (struct si_context *)ctx;
339b8e80941Smrg
340b8e80941Smrg	dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
341b8e80941Smrg
342b8e80941Smrg	if (sctx->log)
343b8e80941Smrg		u_log_printf(sctx->log, "\nString marker: %*s\n", len, string);
344b8e80941Smrg}
345b8e80941Smrg
346b8e80941Smrgstatic void si_set_debug_callback(struct pipe_context *ctx,
347b8e80941Smrg				  const struct pipe_debug_callback *cb)
348b8e80941Smrg{
349b8e80941Smrg	struct si_context *sctx = (struct si_context *)ctx;
350b8e80941Smrg	struct si_screen *screen = sctx->screen;
351b8e80941Smrg
352b8e80941Smrg	util_queue_finish(&screen->shader_compiler_queue);
353b8e80941Smrg	util_queue_finish(&screen->shader_compiler_queue_low_priority);
354b8e80941Smrg
355b8e80941Smrg	if (cb)
356b8e80941Smrg		sctx->debug = *cb;
357b8e80941Smrg	else
358b8e80941Smrg		memset(&sctx->debug, 0, sizeof(sctx->debug));
359b8e80941Smrg}
360b8e80941Smrg
361b8e80941Smrgstatic void si_set_log_context(struct pipe_context *ctx,
362b8e80941Smrg			       struct u_log_context *log)
363b8e80941Smrg{
364b8e80941Smrg	struct si_context *sctx = (struct si_context *)ctx;
365b8e80941Smrg	sctx->log = log;
366b8e80941Smrg
367b8e80941Smrg	if (log)
368b8e80941Smrg		u_log_add_auto_logger(log, si_auto_log_cs, sctx);
369b8e80941Smrg}
370b8e80941Smrg
371b8e80941Smrgstatic void si_set_context_param(struct pipe_context *ctx,
372b8e80941Smrg				 enum pipe_context_param param,
373b8e80941Smrg				 unsigned value)
374b8e80941Smrg{
375b8e80941Smrg	struct radeon_winsys *ws = ((struct si_context *)ctx)->ws;
376b8e80941Smrg
377b8e80941Smrg	switch (param) {
378b8e80941Smrg	case PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE:
379b8e80941Smrg		ws->pin_threads_to_L3_cache(ws, value);
380b8e80941Smrg		break;
381b8e80941Smrg	default:;
382b8e80941Smrg	}
383b8e80941Smrg}
384b8e80941Smrg
385b8e80941Smrgstatic struct pipe_context *si_create_context(struct pipe_screen *screen,
386b8e80941Smrg                                              unsigned flags)
387848b8605Smrg{
388848b8605Smrg	struct si_context *sctx = CALLOC_STRUCT(si_context);
389848b8605Smrg	struct si_screen* sscreen = (struct si_screen *)screen;
390b8e80941Smrg	struct radeon_winsys *ws = sscreen->ws;
391848b8605Smrg	int shader, i;
392b8e80941Smrg	bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
393848b8605Smrg
394b8e80941Smrg	if (!sctx)
395848b8605Smrg		return NULL;
396848b8605Smrg
397b8e80941Smrg	sctx->has_graphics = sscreen->info.chip_class == SI ||
398b8e80941Smrg			     !(flags & PIPE_CONTEXT_COMPUTE_ONLY);
399b8e80941Smrg
400b8e80941Smrg	if (flags & PIPE_CONTEXT_DEBUG)
401b8e80941Smrg		sscreen->record_llvm_ir = true; /* racy but not critical */
402b8e80941Smrg
403b8e80941Smrg	sctx->b.screen = screen; /* this must be set first */
404b8e80941Smrg	sctx->b.priv = NULL;
405b8e80941Smrg	sctx->b.destroy = si_destroy_context;
406848b8605Smrg	sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
407b8e80941Smrg	sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
408b8e80941Smrg
409b8e80941Smrg	slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
410b8e80941Smrg	slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
411b8e80941Smrg
412b8e80941Smrg	sctx->ws = sscreen->ws;
413b8e80941Smrg	sctx->family = sscreen->info.family;
414b8e80941Smrg	sctx->chip_class = sscreen->info.chip_class;
415b8e80941Smrg
416b8e80941Smrg	if (sscreen->info.has_gpu_reset_counter_query) {
417b8e80941Smrg		sctx->gpu_reset_counter =
418b8e80941Smrg			sctx->ws->query_value(sctx->ws, RADEON_GPU_RESET_COUNTER);
419b8e80941Smrg	}
420b8e80941Smrg
421b8e80941Smrg
422b8e80941Smrg	if (sctx->chip_class == CIK ||
423b8e80941Smrg	    sctx->chip_class == VI ||
424b8e80941Smrg	    sctx->chip_class == GFX9) {
425b8e80941Smrg		sctx->eop_bug_scratch = si_resource(
426b8e80941Smrg			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
427b8e80941Smrg					   16 * sscreen->info.num_render_backends));
428b8e80941Smrg		if (!sctx->eop_bug_scratch)
429b8e80941Smrg			goto fail;
430b8e80941Smrg	}
431b8e80941Smrg
432b8e80941Smrg	/* Initialize context allocators. */
433b8e80941Smrg	sctx->allocator_zeroed_memory =
434b8e80941Smrg		u_suballocator_create(&sctx->b, 128 * 1024,
435b8e80941Smrg				      0, PIPE_USAGE_DEFAULT,
436b8e80941Smrg				      SI_RESOURCE_FLAG_UNMAPPABLE |
437b8e80941Smrg				      SI_RESOURCE_FLAG_CLEAR, false);
438b8e80941Smrg	if (!sctx->allocator_zeroed_memory)
439b8e80941Smrg		goto fail;
440b8e80941Smrg
441b8e80941Smrg	sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
442b8e80941Smrg						    0, PIPE_USAGE_STREAM,
443b8e80941Smrg						    SI_RESOURCE_FLAG_READ_ONLY);
444b8e80941Smrg	if (!sctx->b.stream_uploader)
445b8e80941Smrg		goto fail;
446b8e80941Smrg
447b8e80941Smrg	sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
448b8e80941Smrg						       0, PIPE_USAGE_STAGING, 0);
449b8e80941Smrg	if (!sctx->cached_gtt_allocator)
450b8e80941Smrg		goto fail;
451b8e80941Smrg
452b8e80941Smrg	sctx->ctx = sctx->ws->ctx_create(sctx->ws);
453b8e80941Smrg	if (!sctx->ctx)
454b8e80941Smrg		goto fail;
455b8e80941Smrg
456b8e80941Smrg	if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
457b8e80941Smrg		sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
458b8e80941Smrg						   (void*)si_flush_dma_cs,
459b8e80941Smrg						   sctx, stop_exec_on_failure);
460b8e80941Smrg	}
461b8e80941Smrg
462b8e80941Smrg	bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs;
463b8e80941Smrg	sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
464b8e80941Smrg						 0, PIPE_USAGE_DEFAULT,
465b8e80941Smrg						 SI_RESOURCE_FLAG_32BIT |
466b8e80941Smrg						 (use_sdma_upload ?
467b8e80941Smrg							  SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
468b8e80941Smrg	if (!sctx->b.const_uploader)
469b8e80941Smrg		goto fail;
470b8e80941Smrg
471b8e80941Smrg	if (use_sdma_upload)
472b8e80941Smrg		u_upload_enable_flush_explicit(sctx->b.const_uploader);
473b8e80941Smrg
474b8e80941Smrg	sctx->gfx_cs = ws->cs_create(sctx->ctx,
475b8e80941Smrg				     sctx->has_graphics ? RING_GFX : RING_COMPUTE,
476b8e80941Smrg				     (void*)si_flush_gfx_cs, sctx, stop_exec_on_failure);
477b8e80941Smrg
478b8e80941Smrg	/* Border colors. */
479b8e80941Smrg	sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
480b8e80941Smrg					  sizeof(*sctx->border_color_table));
481b8e80941Smrg	if (!sctx->border_color_table)
482b8e80941Smrg		goto fail;
483b8e80941Smrg
484b8e80941Smrg	sctx->border_color_buffer = si_resource(
485b8e80941Smrg		pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT,
486b8e80941Smrg				   SI_MAX_BORDER_COLORS *
487b8e80941Smrg				   sizeof(*sctx->border_color_table)));
488b8e80941Smrg	if (!sctx->border_color_buffer)
489b8e80941Smrg		goto fail;
490848b8605Smrg
491b8e80941Smrg	sctx->border_color_map =
492b8e80941Smrg		ws->buffer_map(sctx->border_color_buffer->buf,
493b8e80941Smrg			       NULL, PIPE_TRANSFER_WRITE);
494b8e80941Smrg	if (!sctx->border_color_map)
495848b8605Smrg		goto fail;
496848b8605Smrg
497b8e80941Smrg	/* Initialize context functions used by graphics and compute. */
498b8e80941Smrg	sctx->b.emit_string_marker = si_emit_string_marker;
499b8e80941Smrg	sctx->b.set_debug_callback = si_set_debug_callback;
500b8e80941Smrg	sctx->b.set_log_context = si_set_log_context;
501b8e80941Smrg	sctx->b.set_context_param = si_set_context_param;
502b8e80941Smrg	sctx->b.get_device_reset_status = si_get_reset_status;
503b8e80941Smrg	sctx->b.set_device_reset_callback = si_set_device_reset_callback;
504b8e80941Smrg
505b8e80941Smrg	si_init_all_descriptors(sctx);
506b8e80941Smrg	si_init_buffer_functions(sctx);
507b8e80941Smrg	si_init_clear_functions(sctx);
508848b8605Smrg	si_init_blit_functions(sctx);
509848b8605Smrg	si_init_compute_functions(sctx);
510b8e80941Smrg	si_init_compute_blit_functions(sctx);
511b8e80941Smrg	si_init_debug_functions(sctx);
512b8e80941Smrg	si_init_fence_functions(sctx);
513b8e80941Smrg	si_init_state_compute_functions(sctx);
514b8e80941Smrg
515b8e80941Smrg	/* Initialize graphics-only context functions. */
516b8e80941Smrg	if (sctx->has_graphics) {
517b8e80941Smrg		si_init_context_texture_functions(sctx);
518b8e80941Smrg		si_init_query_functions(sctx);
519b8e80941Smrg		si_init_msaa_functions(sctx);
520b8e80941Smrg		si_init_shader_functions(sctx);
521b8e80941Smrg		si_init_state_functions(sctx);
522b8e80941Smrg		si_init_streamout_functions(sctx);
523b8e80941Smrg		si_init_viewport_functions(sctx);
524848b8605Smrg
525b8e80941Smrg		sctx->blitter = util_blitter_create(&sctx->b);
526b8e80941Smrg		if (sctx->blitter == NULL)
527b8e80941Smrg			goto fail;
528b8e80941Smrg		sctx->blitter->skip_viewport_restore = true;
529b8e80941Smrg
530b8e80941Smrg		si_init_draw_functions(sctx);
531848b8605Smrg	}
532848b8605Smrg
533b8e80941Smrg	/* Initialize SDMA functions. */
534b8e80941Smrg	if (sctx->chip_class >= CIK)
535b8e80941Smrg		cik_init_sdma_functions(sctx);
536b8e80941Smrg	else
537b8e80941Smrg		si_init_dma_functions(sctx);
538848b8605Smrg
539b8e80941Smrg	if (sscreen->debug_flags & DBG(FORCE_DMA))
540b8e80941Smrg		sctx->b.resource_copy_region = sctx->dma_copy;
541848b8605Smrg
542b8e80941Smrg	sctx->sample_mask = 0xffff;
543848b8605Smrg
544b8e80941Smrg	/* Initialize multimedia functions. */
545b8e80941Smrg	if (sscreen->info.has_hw_decode) {
546b8e80941Smrg		sctx->b.create_video_codec = si_uvd_create_decoder;
547b8e80941Smrg		sctx->b.create_video_buffer = si_video_buffer_create;
548b8e80941Smrg	} else {
549b8e80941Smrg		sctx->b.create_video_codec = vl_create_decoder;
550b8e80941Smrg		sctx->b.create_video_buffer = vl_video_buffer_create;
551b8e80941Smrg	}
552848b8605Smrg
553b8e80941Smrg	if (sctx->chip_class >= GFX9) {
554b8e80941Smrg		sctx->wait_mem_scratch = si_resource(
555b8e80941Smrg			pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4));
556b8e80941Smrg		if (!sctx->wait_mem_scratch)
557b8e80941Smrg			goto fail;
558848b8605Smrg
559b8e80941Smrg		/* Initialize the memory. */
560b8e80941Smrg		si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4,
561b8e80941Smrg				 V_370_MEM, V_370_ME, &sctx->wait_mem_number);
562848b8605Smrg	}
563848b8605Smrg
564b8e80941Smrg	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
565b8e80941Smrg	 * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
566b8e80941Smrg	if (sctx->chip_class == CIK) {
567b8e80941Smrg		sctx->null_const_buf.buffer =
568b8e80941Smrg			pipe_aligned_buffer_create(screen,
569b8e80941Smrg						   SI_RESOURCE_FLAG_32BIT,
570b8e80941Smrg						   PIPE_USAGE_DEFAULT, 16,
571b8e80941Smrg						   sctx->screen->info.tcc_cache_line_size);
572b8e80941Smrg		if (!sctx->null_const_buf.buffer)
573b8e80941Smrg			goto fail;
574848b8605Smrg		sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0;
575848b8605Smrg
576b8e80941Smrg		unsigned start_shader = sctx->has_graphics ? 0 :  PIPE_SHADER_COMPUTE;
577b8e80941Smrg		for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) {
578848b8605Smrg			for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
579b8e80941Smrg				sctx->b.set_constant_buffer(&sctx->b, shader, i,
580848b8605Smrg							      &sctx->null_const_buf);
581848b8605Smrg			}
582848b8605Smrg		}
583848b8605Smrg
584b8e80941Smrg		si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
585b8e80941Smrg				 &sctx->null_const_buf);
586b8e80941Smrg		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
587b8e80941Smrg				 &sctx->null_const_buf);
588b8e80941Smrg		si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
589b8e80941Smrg				 &sctx->null_const_buf);
590b8e80941Smrg		si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
591b8e80941Smrg				 &sctx->null_const_buf);
592b8e80941Smrg		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
593b8e80941Smrg				 &sctx->null_const_buf);
594848b8605Smrg	}
595848b8605Smrg
596b8e80941Smrg	uint64_t max_threads_per_block;
597b8e80941Smrg	screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
598b8e80941Smrg				  PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
599b8e80941Smrg				  &max_threads_per_block);
600b8e80941Smrg
601b8e80941Smrg	/* The maximum number of scratch waves. Scratch space isn't divided
602b8e80941Smrg	 * evenly between CUs. The number is only a function of the number of CUs.
603b8e80941Smrg	 * We can decrease the constant to decrease the scratch buffer size.
604b8e80941Smrg	 *
605b8e80941Smrg	 * sctx->scratch_waves must be >= the maximum posible size of
606b8e80941Smrg	 * 1 threadgroup, so that the hw doesn't hang from being unable
607b8e80941Smrg	 * to start any.
608b8e80941Smrg	 *
609b8e80941Smrg	 * The recommended value is 4 per CU at most. Higher numbers don't
610b8e80941Smrg	 * bring much benefit, but they still occupy chip resources (think
611b8e80941Smrg	 * async compute). I've seen ~2% performance difference between 4 and 32.
612b8e80941Smrg	 */
613b8e80941Smrg	sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units,
614b8e80941Smrg				   max_threads_per_block / 64);
615b8e80941Smrg
616b8e80941Smrg	si_init_compiler(sscreen, &sctx->compiler);
617b8e80941Smrg
618b8e80941Smrg	/* Bindless handles. */
619b8e80941Smrg	sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
620b8e80941Smrg						    _mesa_key_pointer_equal);
621b8e80941Smrg	sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
622b8e80941Smrg						    _mesa_key_pointer_equal);
623b8e80941Smrg
624b8e80941Smrg	util_dynarray_init(&sctx->resident_tex_handles, NULL);
625b8e80941Smrg	util_dynarray_init(&sctx->resident_img_handles, NULL);
626b8e80941Smrg	util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
627b8e80941Smrg	util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
628b8e80941Smrg	util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
629b8e80941Smrg
630b8e80941Smrg	sctx->sample_pos_buffer =
631b8e80941Smrg		pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT,
632b8e80941Smrg				   sizeof(sctx->sample_positions));
633b8e80941Smrg	pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0,
634b8e80941Smrg			  sizeof(sctx->sample_positions), &sctx->sample_positions);
635b8e80941Smrg
636b8e80941Smrg	/* this must be last */
637b8e80941Smrg	si_begin_new_gfx_cs(sctx);
638b8e80941Smrg
639b8e80941Smrg	if (sctx->chip_class == CIK) {
640b8e80941Smrg		/* Clear the NULL constant buffer, because loads should return zeros.
641b8e80941Smrg		 * Note that this forces CP DMA to be used, because clover deadlocks
642b8e80941Smrg		 * for some reason when the compute codepath is used.
643b8e80941Smrg		 */
644b8e80941Smrg		uint32_t clear_value = 0;
645b8e80941Smrg		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
646b8e80941Smrg				sctx->null_const_buf.buffer->width0,
647b8e80941Smrg				&clear_value, 4, SI_COHERENCY_SHADER, true);
648b8e80941Smrg	}
649b8e80941Smrg	return &sctx->b;
650848b8605Smrgfail:
651b8e80941Smrg	fprintf(stderr, "radeonsi: Failed to create a context.\n");
652b8e80941Smrg	si_destroy_context(&sctx->b);
653848b8605Smrg	return NULL;
654848b8605Smrg}
655848b8605Smrg
656b8e80941Smrgstatic struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
657b8e80941Smrg						   void *priv, unsigned flags)
658b8e80941Smrg{
659b8e80941Smrg	struct si_screen *sscreen = (struct si_screen *)screen;
660b8e80941Smrg	struct pipe_context *ctx;
661b8e80941Smrg
662b8e80941Smrg	if (sscreen->debug_flags & DBG(CHECK_VM))
663b8e80941Smrg		flags |= PIPE_CONTEXT_DEBUG;
664b8e80941Smrg
665b8e80941Smrg	ctx = si_create_context(screen, flags);
666b8e80941Smrg
667b8e80941Smrg	if (!(flags & PIPE_CONTEXT_PREFER_THREADED))
668b8e80941Smrg		return ctx;
669b8e80941Smrg
670b8e80941Smrg	/* Clover (compute-only) is unsupported. */
671b8e80941Smrg	if (flags & PIPE_CONTEXT_COMPUTE_ONLY)
672b8e80941Smrg		return ctx;
673b8e80941Smrg
674b8e80941Smrg	/* When shaders are logged to stderr, asynchronous compilation is
675b8e80941Smrg	 * disabled too. */
676b8e80941Smrg	if (sscreen->debug_flags & DBG_ALL_SHADERS)
677b8e80941Smrg		return ctx;
678b8e80941Smrg
679b8e80941Smrg	/* Use asynchronous flushes only on amdgpu, since the radeon
680b8e80941Smrg	 * implementation for fence_server_sync is incomplete. */
681b8e80941Smrg	return threaded_context_create(ctx, &sscreen->pool_transfers,
682b8e80941Smrg				       si_replace_buffer_storage,
683b8e80941Smrg				       sscreen->info.drm_major >= 3 ? si_create_fence : NULL,
684b8e80941Smrg				       &((struct si_context*)ctx)->tc);
685b8e80941Smrg}
686b8e80941Smrg
687848b8605Smrg/*
688848b8605Smrg * pipe_screen
689848b8605Smrg */
/* pipe_screen::destroy implementation.
 *
 * Tears down the whole screen: the auxiliary context and its log,
 * compiler queues and compiler instances, cached shader parts, the
 * in-memory and on-disk shader caches, perf counters, the GPU-load
 * thread, and finally the winsys itself. The teardown only runs when
 * the last winsys reference is dropped.
 */
static void si_destroy_screen(struct pipe_screen* pscreen)
{
	struct si_screen *sscreen = (struct si_screen *)pscreen;
	/* Per-stage shader part lists; each is a singly linked list freed
	 * below. Must happen before shader_parts_mutex is destroyed. */
	struct si_shader_part *parts[] = {
		sscreen->vs_prologs,
		sscreen->tcs_epilogs,
		sscreen->gs_prologs,
		sscreen->ps_prologs,
		sscreen->ps_epilogs
	};
	unsigned i;

	/* The winsys may be shared by several screens; only the holder of
	 * the last reference performs the actual destruction. */
	if (!sscreen->ws->unref(sscreen->ws))
		return;

	mtx_destroy(&sscreen->aux_context_lock);

	/* Detach and free the aux context's log (if aux_debug was enabled
	 * at creation) before destroying the aux context itself. */
	struct u_log_context *aux_log = ((struct si_context *)sscreen->aux_context)->log;
	if (aux_log) {
		sscreen->aux_context->set_log_context(sscreen->aux_context, NULL);
		u_log_context_destroy(aux_log);
		FREE(aux_log);
	}

	sscreen->aux_context->destroy(sscreen->aux_context);

	/* Stop the compile queues before freeing the compilers they use. */
	util_queue_destroy(&sscreen->shader_compiler_queue);
	util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);

	for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++)
		si_destroy_compiler(&sscreen->compiler[i]);

	for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++)
		si_destroy_compiler(&sscreen->compiler_lowp[i]);

	/* Free shader parts. */
	for (i = 0; i < ARRAY_SIZE(parts); i++) {
		while (parts[i]) {
			struct si_shader_part *part = parts[i];

			parts[i] = part->next;
			ac_shader_binary_clean(&part->binary);
			FREE(part);
		}
	}
	mtx_destroy(&sscreen->shader_parts_mutex);
	si_destroy_shader_cache(sscreen);

	si_destroy_perfcounters(sscreen);
	si_gpu_load_kill_thread(sscreen);

	mtx_destroy(&sscreen->gpu_load_mutex);

	slab_destroy_parent(&sscreen->pool_transfers);

	disk_cache_destroy(sscreen->disk_shader_cache);
	sscreen->ws->destroy(sscreen->ws);
	FREE(sscreen);
}
749848b8605Smrg
750b8e80941Smrgstatic void si_init_gs_info(struct si_screen *sscreen)
751848b8605Smrg{
752b8e80941Smrg	sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class,
753b8e80941Smrg							sscreen->info.family);
754b8e80941Smrg}
755b8e80941Smrg
756b8e80941Smrgstatic void si_test_vmfault(struct si_screen *sscreen)
757b8e80941Smrg{
758b8e80941Smrg	struct pipe_context *ctx = sscreen->aux_context;
759b8e80941Smrg	struct si_context *sctx = (struct si_context *)ctx;
760b8e80941Smrg	struct pipe_resource *buf =
761b8e80941Smrg		pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64);
762b8e80941Smrg
763b8e80941Smrg	if (!buf) {
764b8e80941Smrg		puts("Buffer allocation failed.");
765b8e80941Smrg		exit(1);
766848b8605Smrg	}
767848b8605Smrg
768b8e80941Smrg	si_resource(buf)->gpu_address = 0; /* cause a VM fault */
769b8e80941Smrg
770b8e80941Smrg	if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) {
771b8e80941Smrg		si_cp_dma_copy_buffer(sctx, buf, buf, 0, 4, 4, 0,
772b8e80941Smrg				      SI_COHERENCY_NONE, L2_BYPASS);
773b8e80941Smrg		ctx->flush(ctx, NULL, 0);
774b8e80941Smrg		puts("VM fault test: CP - done.");
775b8e80941Smrg	}
776b8e80941Smrg	if (sscreen->debug_flags & DBG(TEST_VMFAULT_SDMA)) {
777b8e80941Smrg		si_sdma_clear_buffer(sctx, buf, 0, 4, 0);
778b8e80941Smrg		ctx->flush(ctx, NULL, 0);
779b8e80941Smrg		puts("VM fault test: SDMA - done.");
780b8e80941Smrg	}
781b8e80941Smrg	if (sscreen->debug_flags & DBG(TEST_VMFAULT_SHADER)) {
782b8e80941Smrg		util_test_constant_buffer(ctx, buf);
783b8e80941Smrg		puts("VM fault test: Shader - done.");
784b8e80941Smrg	}
785b8e80941Smrg	exit(0);
786848b8605Smrg}
787848b8605Smrg
788b8e80941Smrgstatic void si_test_gds_memory_management(struct si_context *sctx,
789b8e80941Smrg					  unsigned alloc_size, unsigned alignment,
790b8e80941Smrg					  enum radeon_bo_domain domain)
791848b8605Smrg{
792b8e80941Smrg	struct radeon_winsys *ws = sctx->ws;
793b8e80941Smrg	struct radeon_cmdbuf *cs[8];
794b8e80941Smrg	struct pb_buffer *gds_bo[ARRAY_SIZE(cs)];
795b8e80941Smrg
796b8e80941Smrg	for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
797b8e80941Smrg		cs[i] = ws->cs_create(sctx->ctx, RING_COMPUTE,
798b8e80941Smrg				      NULL, NULL, false);
799b8e80941Smrg		gds_bo[i] = ws->buffer_create(ws, alloc_size, alignment, domain, 0);
800b8e80941Smrg		assert(gds_bo[i]);
801b8e80941Smrg	}
802848b8605Smrg
803b8e80941Smrg	for (unsigned iterations = 0; iterations < 20000; iterations++) {
804b8e80941Smrg		for (unsigned i = 0; i < ARRAY_SIZE(cs); i++) {
805b8e80941Smrg			/* This clears GDS with CP DMA.
806b8e80941Smrg			 *
807b8e80941Smrg			 * We don't care if GDS is present. Just add some packet
808b8e80941Smrg			 * to make the GPU busy for a moment.
809b8e80941Smrg			 */
810b8e80941Smrg			si_cp_dma_clear_buffer(sctx, cs[i], NULL, 0, alloc_size, 0,
811b8e80941Smrg					       SI_CPDMA_SKIP_BO_LIST_UPDATE |
812b8e80941Smrg					       SI_CPDMA_SKIP_CHECK_CS_SPACE |
813b8e80941Smrg					       SI_CPDMA_SKIP_GFX_SYNC, 0, 0);
814b8e80941Smrg
815b8e80941Smrg			ws->cs_add_buffer(cs[i], gds_bo[i], domain,
816b8e80941Smrg					  RADEON_USAGE_READWRITE, 0);
817b8e80941Smrg			ws->cs_flush(cs[i], PIPE_FLUSH_ASYNC, NULL);
818b8e80941Smrg		}
819b8e80941Smrg	}
820b8e80941Smrg	exit(0);
821b8e80941Smrg}
822b8e80941Smrg
823b8e80941Smrgstatic void si_disk_cache_create(struct si_screen *sscreen)
824b8e80941Smrg{
825b8e80941Smrg	/* Don't use the cache if shader dumping is enabled. */
826b8e80941Smrg	if (sscreen->debug_flags & DBG_ALL_SHADERS)
827848b8605Smrg		return;
828848b8605Smrg
829b8e80941Smrg	struct mesa_sha1 ctx;
830b8e80941Smrg	unsigned char sha1[20];
831b8e80941Smrg	char cache_id[20 * 2 + 1];
832b8e80941Smrg
833b8e80941Smrg	_mesa_sha1_init(&ctx);
834b8e80941Smrg
835b8e80941Smrg	if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) ||
836b8e80941Smrg	    !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo,
837b8e80941Smrg						&ctx))
838848b8605Smrg		return;
839848b8605Smrg
840b8e80941Smrg	_mesa_sha1_final(&ctx, sha1);
841b8e80941Smrg	disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
842848b8605Smrg
843b8e80941Smrg	/* These flags affect shader compilation. */
844b8e80941Smrg	#define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) |	\
845b8e80941Smrg			   DBG(SI_SCHED) |			\
846b8e80941Smrg			   DBG(GISEL) |				\
847b8e80941Smrg			   DBG(UNSAFE_MATH))
848b8e80941Smrg	uint64_t shader_debug_flags = sscreen->debug_flags &
849b8e80941Smrg		ALL_FLAGS;
850848b8605Smrg
851b8e80941Smrg	/* Add the high bits of 32-bit addresses, which affects
852b8e80941Smrg	 * how 32-bit addresses are expanded to 64 bits.
853b8e80941Smrg	 */
854b8e80941Smrg	STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
855b8e80941Smrg	assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi);
856b8e80941Smrg	shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32;
857b8e80941Smrg
858b8e80941Smrg	if (sscreen->options.enable_nir)
859b8e80941Smrg		shader_debug_flags |= 1ull << 48;
860b8e80941Smrg
861b8e80941Smrg	sscreen->disk_shader_cache =
862b8e80941Smrg		disk_cache_create(sscreen->info.name,
863b8e80941Smrg				  cache_id,
864b8e80941Smrg				  shader_debug_flags);
865b8e80941Smrg}
866b8e80941Smrg
867b8e80941Smrgstatic void si_set_max_shader_compiler_threads(struct pipe_screen *screen,
868b8e80941Smrg					       unsigned max_threads)
869848b8605Smrg{
870b8e80941Smrg	struct si_screen *sscreen = (struct si_screen *)screen;
871848b8605Smrg
872b8e80941Smrg	/* This function doesn't allow a greater number of threads than
873b8e80941Smrg	 * the queue had at its creation. */
874b8e80941Smrg	util_queue_adjust_num_threads(&sscreen->shader_compiler_queue,
875b8e80941Smrg				      max_threads);
876b8e80941Smrg	/* Don't change the number of threads on the low priority queue. */
877b8e80941Smrg}
878848b8605Smrg
879b8e80941Smrgstatic bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen,
880b8e80941Smrg						       void *shader,
881b8e80941Smrg						       unsigned shader_type)
882b8e80941Smrg{
883b8e80941Smrg	if (shader_type == PIPE_SHADER_COMPUTE) {
884b8e80941Smrg		struct si_compute *cs = (struct si_compute*)shader;
885848b8605Smrg
886b8e80941Smrg		return util_queue_fence_is_signalled(&cs->ready);
887848b8605Smrg	}
888b8e80941Smrg	struct si_shader_selector *sel = (struct si_shader_selector *)shader;
889b8e80941Smrg
890b8e80941Smrg	return util_queue_fence_is_signalled(&sel->ready);
891848b8605Smrg}
892848b8605Smrg
/* Entry point of the radeonsi driver: create the pipe_screen for a winsys.
 *
 * Queries GPU info, installs the screen vtables, reads debug flags and
 * driconf options, sets up the disk shader cache and compiler thread
 * queues, derives per-family hardware feature/workaround flags, and
 * finally creates the auxiliary context (which must be done last).
 * Several AMD_DEBUG self-test modes run at the end and never return.
 *
 * \param ws      winsys; owned by the screen on success
 * \param config  driconf options from the state tracker
 * \return the new screen, or NULL on allocation/queue-init failure
 */
struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
					   const struct pipe_screen_config *config)
{
	struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
	unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads, i;

	if (!sscreen) {
		return NULL;
	}

	sscreen->ws = ws;
	ws->query_info(ws, &sscreen->info);

	/* Raster config: fixed formula on GFX9+, computed from GPU info on
	 * older generations. */
	if (sscreen->info.chip_class >= GFX9) {
		sscreen->se_tile_repeat = 32 * sscreen->info.max_se;
	} else {
		ac_get_raster_config(&sscreen->info,
				     &sscreen->pa_sc_raster_config,
				     &sscreen->pa_sc_raster_config_1,
				     &sscreen->se_tile_repeat);
	}

	/* Both the legacy R600_DEBUG and AMD_DEBUG env vars are honored;
	 * their flag sets are OR'd together. */
	sscreen->debug_flags = debug_get_flags_option("R600_DEBUG",
						      debug_options, 0);
	sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG",
						       debug_options, 0);

	/* Set functions first. */
	sscreen->b.context_create = si_pipe_create_context;
	sscreen->b.destroy = si_destroy_screen;
	sscreen->b.set_max_shader_compiler_threads =
		si_set_max_shader_compiler_threads;
	sscreen->b.is_parallel_shader_compilation_finished =
		si_is_parallel_shader_compilation_finished;

	si_init_screen_get_functions(sscreen);
	si_init_screen_buffer_functions(sscreen);
	si_init_screen_fence_functions(sscreen);
	si_init_screen_state_functions(sscreen);
	si_init_screen_texture_functions(sscreen);
	si_init_screen_query_functions(sscreen);

	/* Set these flags in debug_flags early, so that the shader cache takes
	 * them into account.
	 */
	if (driQueryOptionb(config->options,
			    "glsl_correct_derivatives_after_discard"))
		sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL);
	if (driQueryOptionb(config->options, "radeonsi_enable_sisched"))
		sscreen->debug_flags |= DBG(SI_SCHED);

	if (sscreen->debug_flags & DBG(INFO))
		ac_print_gpu_info(&sscreen->info);

	slab_create_parent(&sscreen->pool_transfers,
			   sizeof(struct si_transfer), 64);

	/* R600_TEX_ANISO overrides the anisotropic filtering level;
	 * -1 (the default) leaves it app-controlled. */
	sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
	if (sscreen->force_aniso >= 0) {
		printf("radeonsi: Forcing anisotropy filter to %ix\n",
		       /* round down to a power of two */
		       1 << util_logbase2(sscreen->force_aniso));
	}

	(void) mtx_init(&sscreen->aux_context_lock, mtx_plain);
	(void) mtx_init(&sscreen->gpu_load_mutex, mtx_plain);

	si_init_gs_info(sscreen);
	if (!si_init_shader_cache(sscreen)) {
		FREE(sscreen);
		return NULL;
	}

	/* Must come after the debug-flag tweaks above: the cache key
	 * incorporates shader-affecting debug flags. */
	si_disk_cache_create(sscreen);

	/* Determine the number of shader compiler threads. */
	hw_threads = sysconf(_SC_NPROCESSORS_ONLN);

	if (hw_threads >= 12) {
		num_comp_hi_threads = hw_threads * 3 / 4;
		num_comp_lo_threads = hw_threads / 3;
	} else if (hw_threads >= 6) {
		num_comp_hi_threads = hw_threads - 2;
		num_comp_lo_threads = hw_threads / 2;
	} else if (hw_threads >= 2) {
		num_comp_hi_threads = hw_threads - 1;
		num_comp_lo_threads = hw_threads / 2;
	} else {
		num_comp_hi_threads = 1;
		num_comp_lo_threads = 1;
	}

	/* Thread counts are capped by the number of compiler instances. */
	num_comp_hi_threads = MIN2(num_comp_hi_threads,
				   ARRAY_SIZE(sscreen->compiler));
	num_comp_lo_threads = MIN2(num_comp_lo_threads,
				   ARRAY_SIZE(sscreen->compiler_lowp));

	if (!util_queue_init(&sscreen->shader_compiler_queue, "sh",
			     64, num_comp_hi_threads,
			     UTIL_QUEUE_INIT_RESIZE_IF_FULL |
			     UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
		si_destroy_shader_cache(sscreen);
		FREE(sscreen);
		return NULL;
	}

	if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
			     "shlo",
			     64, num_comp_lo_threads,
			     UTIL_QUEUE_INIT_RESIZE_IF_FULL |
			     UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY |
			     UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
	       si_destroy_shader_cache(sscreen);
	       FREE(sscreen);
	       return NULL;
	}

	if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
		si_init_perfcounters(sscreen);

	/* Determine tessellation ring info. */
	bool double_offchip_buffers = sscreen->info.chip_class >= CIK &&
				      sscreen->info.family != CHIP_CARRIZO &&
				      sscreen->info.family != CHIP_STONEY;
	/* This must be one less than the maximum number due to a hw limitation.
	 * Various hardware bugs in SI, CIK, and GFX9 need this.
	 */
	unsigned max_offchip_buffers_per_se;

	/* Only certain chips can use the maximum value. */
	if (sscreen->info.family == CHIP_VEGA12 ||
	    sscreen->info.family == CHIP_VEGA20)
		max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
	else
		max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;

	unsigned max_offchip_buffers = max_offchip_buffers_per_se *
				       sscreen->info.max_se;
	unsigned offchip_granularity;

	/* Hawaii has a bug with offchip buffers > 256 that can be worked
	 * around by setting 4K granularity.
	 */
	if (sscreen->info.family == CHIP_HAWAII) {
		sscreen->tess_offchip_block_dw_size = 4096;
		offchip_granularity = V_03093C_X_4K_DWORDS;
	} else {
		sscreen->tess_offchip_block_dw_size = 8192;
		offchip_granularity = V_03093C_X_8K_DWORDS;
	}

	sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se;
	assert(((sscreen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0);
	sscreen->tess_offchip_ring_size = max_offchip_buffers *
					  sscreen->tess_offchip_block_dw_size * 4;

	if (sscreen->info.chip_class >= CIK) {
		if (sscreen->info.chip_class >= VI)
			--max_offchip_buffers;
		sscreen->vgt_hs_offchip_param =
			S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) |
			S_03093C_OFFCHIP_GRANULARITY(offchip_granularity);
	} else {
		assert(offchip_granularity == V_03093C_X_8K_DWORDS);
		sscreen->vgt_hs_offchip_param =
			S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
	}

	/* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
	 * on SI. Some CLEAR_STATE cause asic hang on radeon kernel, etc.
	 * SPI_VS_OUT_CONFIG. So only enable CI CLEAR_STATE on amdgpu kernel.*/
       sscreen->has_clear_state = sscreen->info.chip_class >= CIK &&
                                  sscreen->info.drm_major == 3;

	sscreen->has_distributed_tess =
		sscreen->info.chip_class >= VI &&
		sscreen->info.max_se >= 2;

	/* DRAW_INDIRECT_MULTI needs new-enough CP firmware on pre-Polaris. */
	sscreen->has_draw_indirect_multi =
		(sscreen->info.family >= CHIP_POLARIS10) ||
		(sscreen->info.chip_class == VI &&
		 sscreen->info.pfp_fw_version >= 121 &&
		 sscreen->info.me_fw_version >= 87) ||
		(sscreen->info.chip_class == CIK &&
		 sscreen->info.pfp_fw_version >= 211 &&
		 sscreen->info.me_fw_version >= 173) ||
		(sscreen->info.chip_class == SI &&
		 sscreen->info.pfp_fw_version >= 79 &&
		 sscreen->info.me_fw_version >= 142);

	sscreen->has_out_of_order_rast = sscreen->info.chip_class >= VI &&
					 sscreen->info.max_se >= 2 &&
					 !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
	sscreen->assume_no_z_fights =
		driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");
	sscreen->commutative_blend_add =
		driQueryOptionb(config->options, "radeonsi_commutative_blend_add");

	/* Read all radeonsi_* boolean driconf options into sscreen->options. */
	{
#define OPT_BOOL(name, dflt, description) \
		sscreen->options.name = \
			driQueryOptionb(config->options, "radeonsi_"#name);
#include "si_debug_options.h"
	}

	/* Per-family hardware bug flags. */
	sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 ||
					sscreen->info.family == CHIP_RAVEN;
	sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 &&
					    sscreen->info.family <= CHIP_POLARIS12) ||
					   sscreen->info.family == CHIP_VEGA10 ||
					   sscreen->info.family == CHIP_RAVEN;
	sscreen->has_ls_vgpr_init_bug = sscreen->info.family == CHIP_VEGA10 ||
					sscreen->info.family == CHIP_RAVEN;
	sscreen->has_dcc_constant_encode = sscreen->info.family == CHIP_RAVEN2;

	/* Only enable primitive binning on APUs by default. */
	sscreen->dpbb_allowed = sscreen->info.family == CHIP_RAVEN ||
				sscreen->info.family == CHIP_RAVEN2;

	sscreen->dfsm_allowed = sscreen->info.family == CHIP_RAVEN ||
				sscreen->info.family == CHIP_RAVEN2;

	/* Process DPBB enable flags. */
	if (sscreen->debug_flags & DBG(DPBB)) {
		sscreen->dpbb_allowed = true;
		if (sscreen->debug_flags & DBG(DFSM))
			sscreen->dfsm_allowed = true;
	}

	/* Process DPBB disable flags. */
	if (sscreen->debug_flags & DBG(NO_DPBB)) {
		sscreen->dpbb_allowed = false;
		sscreen->dfsm_allowed = false;
	} else if (sscreen->debug_flags & DBG(NO_DFSM)) {
		sscreen->dfsm_allowed = false;
	}

	/* While it would be nice not to have this flag, we are constrained
	 * by the reality that LLVM 5.0 doesn't have working VGPR indexing
	 * on GFX9.
	 */
	sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class <= VI;

	/* Some chips have RB+ registers, but don't support RB+. Those must
	 * always disable it.
	 */
	if (sscreen->info.family == CHIP_STONEY ||
	    sscreen->info.chip_class >= GFX9) {
		sscreen->has_rbplus = true;

		sscreen->rbplus_allowed =
			!(sscreen->debug_flags & DBG(NO_RB_PLUS)) &&
			(sscreen->info.family == CHIP_STONEY ||
			 sscreen->info.family == CHIP_VEGA12 ||
			 sscreen->info.family == CHIP_RAVEN ||
			 sscreen->info.family == CHIP_RAVEN2);
	}

	sscreen->dcc_msaa_allowed =
		!(sscreen->debug_flags & DBG(NO_DCC_MSAA));

	sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= VI;

	(void) mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
	sscreen->use_monolithic_shaders =
		(sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;

	/* Cache-coherency flags for CP<->L2 traffic; pre-GFX9 chips need
	 * explicit global L2 invalidation/writeback. */
	sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
					    SI_CONTEXT_INV_VMEM_L1;
	if (sscreen->info.chip_class <= VI) {
		sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2;
		sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
	}

	if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
		sscreen->debug_flags |= DBG_ALL_SHADERS;

	/* Syntax:
	 *     EQAA=s,z,c
	 * Example:
	 *     EQAA=8,4,2
	 *
	 * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
	 * Constraints:
	 *     s >= z >= c (ignoring this only wastes memory)
	 *     s = [2..16]
	 *     z = [2..8]
	 *     c = [2..8]
	 *
	 * Only MSAA color and depth buffers are overridden.
	 */
	if (sscreen->info.has_eqaa_surface_allocator) {
		const char *eqaa = debug_get_option("EQAA", NULL);
		unsigned s,z,f;

		if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
			sscreen->eqaa_force_coverage_samples = s;
			sscreen->eqaa_force_z_samples = z;
			sscreen->eqaa_force_color_samples = f;
		}
	}

	/* One compiler instance per queue thread. */
	for (i = 0; i < num_comp_hi_threads; i++)
		si_init_compiler(sscreen, &sscreen->compiler[i]);
	for (i = 0; i < num_comp_lo_threads; i++)
		si_init_compiler(sscreen, &sscreen->compiler_lowp[i]);

	/* Create the auxiliary context. This must be done last. */
	sscreen->aux_context = si_create_context(
		&sscreen->b, sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0);
	if (sscreen->options.aux_debug) {
		struct u_log_context *log = CALLOC_STRUCT(u_log_context);
		u_log_context_init(log);
		sscreen->aux_context->set_log_context(sscreen->aux_context, log);
	}

	/* AMD_DEBUG self-test modes; most of these call exit() and never
	 * return. */
	if (sscreen->debug_flags & DBG(TEST_DMA))
		si_test_dma(sscreen);

	if (sscreen->debug_flags & DBG(TEST_DMA_PERF)) {
		si_test_dma_perf(sscreen);
	}

	if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) |
				      DBG(TEST_VMFAULT_SDMA) |
				      DBG(TEST_VMFAULT_SHADER)))
		si_test_vmfault(sscreen);

	if (sscreen->debug_flags & DBG(TEST_GDS))
		si_test_gds((struct si_context*)sscreen->aux_context);

	if (sscreen->debug_flags & DBG(TEST_GDS_MM)) {
		si_test_gds_memory_management((struct si_context*)sscreen->aux_context,
					      32 * 1024, 4, RADEON_DOMAIN_GDS);
	}
	if (sscreen->debug_flags & DBG(TEST_GDS_OA_MM)) {
		si_test_gds_memory_management((struct si_context*)sscreen->aux_context,
					      4, 1, RADEON_DOMAIN_OA);
	}

	return &sscreen->b;
}
1235