/**************************************************************************
 *
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/* This is a wrapper for pipe_context that executes all pipe_context calls
 * in another thread.
 *
 *
 * Guidelines for adopters and deviations from Gallium
 * ---------------------------------------------------
 *
 * 1) pipe_context is wrapped; pipe_screen isn't wrapped. All pipe_screen
 *    driver functions that take a context (fence_finish, texture_get_handle)
 *    should manually unwrap pipe_context by doing:
 *      pipe = threaded_context_unwrap_sync(pipe);
 *    (an example sketch appears below, after the list of thread-safe
 *    functions)
 *
 *    pipe_context::priv is used to unwrap the context, so drivers and state
 *    trackers shouldn't use it.
 *
 *    No other objects are wrapped.
 *
 * 2) Drivers must subclass and initialize these structures:
 *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
 *    - threaded_query for pipe_query (zero the memory)
 *    - threaded_transfer for pipe_transfer (zero the memory)
 *
 * 3) The threaded context must not be enabled for contexts that can use video
 *    codecs.
 *
 * 4) Changes in driver behavior:
 *    - begin_query and end_query always return true; return values from
 *      the driver are ignored.
 *    - generate_mipmap uses is_format_supported to determine success;
 *      the return value from the driver is ignored.
 *    - resource_commit always returns true; failures are ignored.
 *    - set_debug_callback is skipped if the callback is synchronous.
 *
 *
 * Thread-safety requirements on context functions
 * -----------------------------------------------
 *
 * These pipe_context functions are executed directly, so they shouldn't use
 * pipe_context in an unsafe way. They are de facto screen functions now:
 * - create_query
 * - create_batch_query
 * - create_*_state (all CSOs and shaders)
 *     - Make sure the shader compiler doesn't use any per-context state
 *       (e.g. the LLVM target machine).
 *     - Only pipe_context's debug callback for shader dumps is guaranteed to
 *       be up to date, because set_debug_callback synchronizes execution.
 * - create_surface
 * - surface_destroy
 * - create_sampler_view
 * - sampler_view_destroy
 * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
 * - get_query_result (when threaded_query::flushed == true)
 * - create_stream_output_target
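 *
 * As an illustration of guideline 1 above, a screen function that takes a
 * context in a hypothetical driver might start like this (a sketch only;
 * the mydrv_* name is made up and not part of this interface):
 *
 *      static bool mydrv_fence_finish(struct pipe_screen *screen,
 *                                     struct pipe_context *ctx,
 *                                     struct pipe_fence_handle *fence,
 *                                     uint64_t timeout)
 *      {
 *         ctx = threaded_context_unwrap_sync(ctx);
 *         ... from here on, ctx refers to the driver context, never the
 *             threaded_context wrapper ...
 *      }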
 *
 *
 * Transfer_map rules for buffer mappings
 * --------------------------------------
 *
 * 1) If transfer_map has PIPE_MAP_UNSYNCHRONIZED, the call is made
 *    in the non-driver thread without flushing the queue. The driver will
 *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to
 *    PIPE_MAP_UNSYNCHRONIZED to indicate this.
 *    Note that transfer_unmap is always enqueued and called from the driver
 *    thread.
 *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
 *    the valid buffer range. The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring
 *    the flag will lead to failures.
 *    The threaded context does its own detection of unsynchronized mappings.
 *
 * 3) The driver isn't allowed to do buffer invalidations by itself under any
 *    circumstances. This is necessary so that unsynchronized maps always map
 *    the latest version of the buffer (invalidations can be queued, while
 *    unsynchronized maps are not, and they should return the latest storage
 *    after an invalidation). The threaded context always sends
 *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
 *    indicate this. Ignoring the flag will lead to failures.
 *    The threaded context uses its own buffer invalidation mechanism.
 *
 * 4) PIPE_MAP_ONCE can no longer be used to infer that a buffer will not be
 *    mapped a second time before it is unmapped.
 *
 *
 * Rules for fences
 * ----------------
 *
 * Flushes will be executed asynchronously in the driver thread if a
 * create_fence callback is provided. This affects fence semantics as follows.
 *
 * When the threaded context wants to perform an asynchronous flush, it will
 * use the create_fence callback to pre-create the fence from the calling
 * thread. This pre-created fence will be passed to pipe_context::flush
 * together with the TC_FLUSH_ASYNC flag.
 *
 * The callback receives the unwrapped context as a parameter, but must use it
 * in a thread-safe way because it is called from a non-driver thread.
 *
 * If the threaded_context does not immediately flush the current batch, the
 * callback also receives a tc_unflushed_batch_token. If fence_finish is called
 * on the returned fence in the context that created the fence,
 * threaded_context_flush must be called.
 *
 * The driver must implement pipe_context::fence_server_sync properly, since
 * the threaded context handles PIPE_FLUSH_ASYNC.
 *
 *
 * Additional requirements
 * -----------------------
 *
 * get_query_result:
 *    If threaded_query::flushed == true, get_query_result should assume that
 *    it's called from a non-driver thread, in which case the driver shouldn't
 *    use the context in an unsafe way.
 *
 * replace_buffer_storage:
 *    The driver has to implement this callback, which will be called when
 *    the threaded context wants to replace a resource's backing storage with
 *    another resource's backing storage. The threaded context uses it to
 *    implement buffer invalidation. This call is always queued.
 *    Note that 'minimum_num_rebinds' specifies only the minimum number of
 *    rebinds which must be managed by the driver; if a buffer is bound
 *    multiple times in the same binding point (e.g. vertex buffer slots
 *    0, 1, 2), that counts as a single rebind.
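 *
 *    For illustration, the shape of such a callback in a hypothetical driver
 *    (a sketch; everything except the prototype is driver-specific):
 *
 *      static void mydrv_replace_buffer_storage(struct pipe_context *ctx,
 *                                               struct pipe_resource *dst,
 *                                               struct pipe_resource *src,
 *                                               unsigned minimum_num_rebinds,
 *                                               uint32_t rebind_mask,
 *                                               uint32_t delete_buffer_id)
 *      {
 *         ... make dst alias src's backing storage ...
 *         ... rebind dst wherever the driver tracks it as bound;
 *             minimum_num_rebinds and rebind_mask describe the minimum
 *             amount of rebinding work, so the driver can stop early ...
 *         ... release delete_buffer_id back to the driver's buffer ID
 *             allocator (see util_idalloc_mt below) ...
 *      }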
 *
 *
 * Optional resource busy callbacks for better performance
 * -------------------------------------------------------
 *
 * This adds checking whether a resource is used by the GPU and whether
 * a resource is referenced by an unflushed command buffer. If neither is true,
 * the threaded context will map the buffer as UNSYNCHRONIZED without flushing
 * or synchronizing the thread and will skip any buffer invalidations
 * (reallocations), because invalidating an idle buffer has no benefit.
 *
 * There is one driver callback and one TC callback:
 *
 * 1) is_resource_busy: It returns true when a resource is busy. If this is
 *    NULL, the resource is considered always busy.
 *
 * 2) tc_driver_internal_flush_notify: If the driver sets
 *    driver_calls_flush_notify = true in threaded_context_create, it should
 *    call this after every internal driver flush. The threaded context uses it
 *    to track internal driver flushes for the purpose of tracking which
 *    buffers are referenced by an unflushed command buffer.
 *
 * If is_resource_busy is set, threaded_resource::buffer_id_unique must be
 * generated by the driver, and the replace_buffer_storage callback should
 * delete the buffer ID passed to it. The driver should use
 * util_idalloc_mt_init_tc.
 *
 *
 * How it works (queue architecture)
 * ---------------------------------
 *
 * There is a multithreaded queue consisting of batches, each batch containing
 * 8-byte slots. Calls can occupy 1 or more slots.
 *
 * Once a batch is full and there is no space for the next call, it's flushed,
 * meaning that it's added to the queue for execution in the other thread.
 * The batches are ordered in a ring and reused once they are idle again.
 * The batching is necessary for low queue/mutex overhead.
 */

#ifndef U_THREADED_CONTEXT_H
#define U_THREADED_CONTEXT_H

#include "c11/threads.h"
#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/bitset.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/u_range.h"
#include "util/u_thread.h"
#include "util/slab.h"

struct threaded_context;
struct tc_unflushed_batch_token;

/* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */
#define TC_DEBUG 0

/* These are map flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
/* Don't invalidate buffers: */
#define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30)
/* transfer_map is called from a non-driver thread: */
#define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31)

/* Custom flush flags sent to drivers. */
/* fence is pre-populated with a fence created by the create_fence callback */
#define TC_FLUSH_ASYNC (1u << 31)

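/* For illustration (a sketch, not part of this interface): a driver that
 * provides the create_fence callback can expect its flush hook to be called
 * with TC_FLUSH_ASYNC and with *fence already pointing at the pre-created
 * fence, per the fence rules above. Hypothetical driver code:
 *
 *      static void mydrv_flush(struct pipe_context *ctx,
 *                              struct pipe_fence_handle **fence,
 *                              unsigned flags)
 *      {
 *         if (fence && (flags & TC_FLUSH_ASYNC)) {
 *            ... attach the pre-created *fence to the command buffer being
 *                submitted instead of creating a new fence ...
 *         }
 *         ...
 *      }
 */
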
/* Size of the queue = number of batch slots in memory.
 * - 1 batch is always idle and records new commands
 * - 1 batch is being executed
 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
 *
 * Use a size as small as possible for low CPU L2 cache usage, but large
 * enough so that the queue isn't stalled too often for not having enough
 * idle batch slots.
 */
#define TC_MAX_BATCHES 10

/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
 * can occupy multiple call slots.
 *
 * The idea is to have batches as small as possible but large enough so that
 * the queuing and mutex overhead is negligible.
 */
#define TC_SLOTS_PER_BATCH 1536

/* The buffer list queue is much deeper than the batch queue because buffer
 * lists need to stay around until the driver internally flushes its command
 * buffer.
 */
#define TC_MAX_BUFFER_LISTS (TC_MAX_BATCHES * 4)

/* This mask is used to get a hash of a buffer ID. It's also the size of
 * the buffer list in bits minus 1. It must be 2^n - 1. The size should be as
 * low as possible to minimize memory usage, but high enough to minimize hash
 * collisions.
 */
#define TC_BUFFER_ID_MASK BITFIELD_MASK(14)

/* Threshold for when to use the queue or sync. */
#define TC_MAX_STRING_MARKER_BYTES 512

/* Threshold for when to enqueue buffer/texture_subdata as-is.
 * If the upload size is greater than this, the threaded context will instead:
 * - for buffers: do DISCARD_RANGE itself
 * - for textures: sync and call the driver directly
 */
#define TC_MAX_SUBDATA_BYTES 320

enum tc_binding_type {
   TC_BINDING_VERTEX_BUFFER,
   TC_BINDING_STREAMOUT_BUFFER,
   TC_BINDING_UBO_VS,
   TC_BINDING_UBO_FS,
   TC_BINDING_UBO_GS,
   TC_BINDING_UBO_TCS,
   TC_BINDING_UBO_TES,
   TC_BINDING_UBO_CS,
   TC_BINDING_SAMPLERVIEW_VS,
   TC_BINDING_SAMPLERVIEW_FS,
   TC_BINDING_SAMPLERVIEW_GS,
   TC_BINDING_SAMPLERVIEW_TCS,
   TC_BINDING_SAMPLERVIEW_TES,
   TC_BINDING_SAMPLERVIEW_CS,
   TC_BINDING_SSBO_VS,
   TC_BINDING_SSBO_FS,
   TC_BINDING_SSBO_GS,
   TC_BINDING_SSBO_TCS,
   TC_BINDING_SSBO_TES,
   TC_BINDING_SSBO_CS,
   TC_BINDING_IMAGE_VS,
   TC_BINDING_IMAGE_FS,
   TC_BINDING_IMAGE_GS,
   TC_BINDING_IMAGE_TCS,
   TC_BINDING_IMAGE_TES,
   TC_BINDING_IMAGE_CS,
};

typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src,
                                               unsigned minimum_num_rebinds,
                                               uint32_t rebind_mask,
                                               uint32_t delete_buffer_id);
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
                                                          struct tc_unflushed_batch_token *token);
typedef bool (*tc_is_resource_busy)(struct pipe_screen *screen,
                                    struct pipe_resource *resource,
                                    unsigned usage);

struct threaded_resource {
   struct pipe_resource b;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for
    * unsynchronized mappings in the non-driver thread. Initially it's set
    * to &b.
    */
   struct pipe_resource *latest;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;

   /* Unique buffer ID. Drivers must set it to a unique non-zero value for
    * buffers; textures must set it to 0. The low bits are used as a hash of
    * the ID. Use util_idalloc_mt to generate these IDs.
    */
   uint32_t buffer_id_unique;

   /* If positive, a staging transfer is in progress.
    */
   int pending_staging_uploads;

   /* If staging uploads are pending, this holds the union of the mapped
    * ranges.
    */
   struct util_range pending_staging_uploads_range;
};
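
/* Example (a sketch with hypothetical mydrv_* names): a driver resource
 * embeds threaded_resource as its base and initializes it at creation time,
 * e.g.:
 *
 *      struct mydrv_resource {
 *         struct threaded_resource b;
 *         ... driver-specific fields ...
 *      };
 *
 *      static struct pipe_resource *
 *      mydrv_resource_create(struct pipe_screen *screen,
 *                            const struct pipe_resource *templ)
 *      {
 *         struct mydrv_resource *res = ...;
 *         ...
 *         threaded_resource_init(&res->b.b);
 *         if (templ->target == PIPE_BUFFER)
 *            res->b.buffer_id_unique = ... allocate from a util_idalloc_mt ...;
 *         return &res->b.b;
 *      }
 */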

struct threaded_transfer {
   struct pipe_transfer b;

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* If b.resource is not the base instance of the buffer, but one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &b.resource->valid_buffer_range.
    */
   struct util_range *valid_buffer_range;
};

struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called in non-deferred mode after end_query. */
   bool flushed;
};

struct tc_call_base {
#if !defined(NDEBUG) && TC_DEBUG >= 1
   uint32_t sentinel;
#endif
   ushort num_slots;
   ushort call_id;
};

/**
 * A token representing an unflushed batch.
 *
 * See the general rules for fences for an explanation.
 */
struct tc_unflushed_batch_token {
   struct pipe_reference ref;
   struct threaded_context *tc;
};

struct tc_batch {
   struct threaded_context *tc;
#if !defined(NDEBUG) && TC_DEBUG >= 1
   unsigned sentinel;
#endif
   uint16_t num_total_slots;
   uint16_t buffer_list_index;
   struct util_queue_fence fence;
   struct tc_unflushed_batch_token *token;
   uint64_t slots[TC_SLOTS_PER_BATCH];
};

struct tc_buffer_list {
   /* Signalled by the driver after it flushes its internal command buffer. */
   struct util_queue_fence driver_flushed_fence;

   /* Buffer list where bit N indicates whether buffer ID hash N is in the list. */
   BITSET_DECLARE(buffer_list, TC_BUFFER_ID_MASK + 1);
};

/**
 * Optional TC parameters/callbacks.
 */
struct threaded_context_options {
   tc_create_fence_func create_fence;
   tc_is_resource_busy is_resource_busy;
   bool driver_calls_flush_notify;

   /**
    * If true, ctx->get_device_reset_status() will be called without
    * synchronizing with the driver thread. Drivers can enable this to avoid
    * TC syncs if their implementation of get_device_reset_status() is
    * safe to call without synchronizing with the driver thread.
    */
   bool unsynchronized_get_device_reset_status;
};
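
/* Example (a sketch with hypothetical mydrv_* names): how a driver might
 * fill these options. All of them are optional: without create_fence,
 * flushes are not executed asynchronously, and without is_resource_busy,
 * every resource is treated as always busy (see the rules above).
 * driver_calls_flush_notify should only be set if the driver really calls
 * tc_driver_internal_flush_notify after each internal flush.
 *
 *      static bool mydrv_is_resource_busy(struct pipe_screen *screen,
 *                                         struct pipe_resource *resource,
 *                                         unsigned usage)
 *      {
 *         ... return false only if the resource is neither used by the GPU
 *             nor referenced by an unflushed command buffer ...
 *      }
 *
 *      struct threaded_context_options options = {
 *         .create_fence = mydrv_create_fence,
 *         .is_resource_busy = mydrv_is_resource_busy,
 *         .driver_calls_flush_notify = true,
 *      };
 */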

struct threaded_context {
   struct pipe_context base;
   struct pipe_context *pipe;
   struct slab_child_pool pool_transfers;
   tc_replace_buffer_storage_func replace_buffer_storage;
   struct threaded_context_options options;
   unsigned map_buffer_alignment;
   unsigned ubo_alignment;

   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots;
   unsigned num_direct_slots;
   unsigned num_syncs;

   bool use_forced_staging_uploads;
   bool add_all_gfx_bindings_to_buffer_list;
   bool add_all_compute_bindings_to_buffer_list;

   /* Estimate of how many VRAM/GTT bytes are mmap'd in the current tc_batch.
    */
   uint64_t bytes_mapped_estimate;
   uint64_t bytes_mapped_limit;

   struct util_queue queue;
   struct util_queue_fence *fence;

#ifndef NDEBUG
   /**
    * The driver thread is normally the queue thread, but there are cases
    * where the queue is flushed directly from the frontend thread.
    */
   thread_id driver_thread;
#endif

   bool seen_tcs;
   bool seen_tes;
   bool seen_gs;

   bool seen_streamout_buffers;
   bool seen_shader_buffers[PIPE_SHADER_TYPES];
   bool seen_image_buffers[PIPE_SHADER_TYPES];
   bool seen_sampler_buffers[PIPE_SHADER_TYPES];

   unsigned max_vertex_buffers;
   unsigned max_const_buffers;
   unsigned max_shader_buffers;
   unsigned max_images;
   unsigned max_samplers;

   unsigned last, next, next_buf_list;

   /* The list of fences that the driver should signal after the next flush.
    * If this is empty, all driver command buffers have been flushed.
    */
   struct util_queue_fence *signal_fences_next_flush[TC_MAX_BUFFER_LISTS];
   unsigned num_signal_fences_next_flush;

   /* Bound buffers are tracked here using a hash of
    * threaded_resource::buffer_id_unique. 0 means unbound.
    */
   uint32_t vertex_buffers[PIPE_MAX_ATTRIBS];
   uint32_t streamout_buffers[PIPE_MAX_SO_BUFFERS];
   uint32_t const_buffers[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t shader_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
   uint32_t image_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
   uint32_t shader_buffers_writeable_mask[PIPE_SHADER_TYPES];
   uint32_t image_buffers_writeable_mask[PIPE_SHADER_TYPES];
   /* Don't use PIPE_MAX_SHADER_SAMPLER_VIEWS because it's too large. */
   uint32_t sampler_buffers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];

   struct tc_batch batch_slots[TC_MAX_BATCHES];
   struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
};
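
/* Example (a sketch with hypothetical mydrv_* names): wrapping a freshly
 * created driver context with the threaded context, using
 * threaded_context_create() declared below:
 *
 *      struct pipe_context *ctx = mydrv_context_create(screen, ...);
 *
 *      return threaded_context_create(ctx,
 *                                     &mydrv_screen->pool_transfers,
 *                                     mydrv_replace_buffer_storage,
 *                                     &options,
 *                                     &mydrv_context(ctx)->tc);
 *
 * The returned pipe_context is what should be handed to the state tracker,
 * and "out" receives the threaded_context so the driver can later pass it to
 * helpers such as tc_driver_internal_flush_notify().
 */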

void threaded_resource_init(struct pipe_resource *res);
void threaded_resource_deinit(struct pipe_resource *res);
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
void tc_driver_internal_flush_notify(struct threaded_context *tc);

struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        const struct threaded_context_options *options,
                        struct threaded_context **out);

void
threaded_context_init_bytes_mapped_limit(struct threaded_context *tc, unsigned divisor);

void
threaded_context_flush(struct pipe_context *_pipe,
                       struct tc_unflushed_batch_token *token,
                       bool prefer_async);

void
tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info,
            unsigned drawid_offset,
            const struct pipe_draw_indirect_info *indirect,
            const struct pipe_draw_start_count_bias *draws,
            unsigned num_draws);

static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   return (struct threaded_context*)pipe;
}

static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   return (struct threaded_resource*)res;
}

static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   return (struct threaded_query*)q;
}

static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   return (struct threaded_transfer*)transfer;
}

static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                   struct tc_unflushed_batch_token *src)
{
   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
      free(*dst);
   *dst = src;
}

/**
 * Helper for !NDEBUG builds to assert that it is called from the driver
 * thread. This is to help drivers ensure that various code paths are not
 * hit indirectly from pipe entry points that are called from the
 * front-end/state-tracker thread.
 */
static inline void
tc_assert_driver_thread(struct threaded_context *tc)
{
   if (!tc)
      return;
#ifndef NDEBUG
   assert(util_thread_id_equal(tc->driver_thread, util_get_thread_id()));
#endif
}

#endif