/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

/* This file implements tests on the si_clearbuffer function. */

#include "si_pipe.h"
#include "si_query.h"

/* Benchmarked buffer sizes run from MIN_SIZE to MAX_SIZE, doubling
 * (<< SIZE_SHIFT) each step. Each (size, method) pair is executed
 * NUM_RUNS times inside one time-elapsed query and averaged.
 */
#define MIN_SIZE 512
#define MAX_SIZE (128 * 1024 * 1024)
#define SIZE_SHIFT 1
#define NUM_RUNS 128

/* Convert a transfer of num_bytes that took ns nanoseconds into a
 * MB/s rate (MB here is 1024*1024 bytes, matching the table header).
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}

/* Benchmark buffer clears and copies using CP DMA and compute shaders
 * across several buffer placements (VRAM/GTT), cache policies, and
 * compute-shader configurations, print a CSV table of MB/s rates, then
 * print generated C code ("get_best_clear_for_*" / "get_best_copy_for_*")
 * that selects the fastest method per size range for this GPU.
 *
 * Creates its own pipe_context on sscreen and never returns: it calls
 * exit(0) when done, so it is intended to be invoked as a standalone
 * test/tool entry point.
 */
void si_test_dma_perf(struct si_screen *sscreen)
{
   struct pipe_screen *screen = &sscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   struct si_context *sctx = (struct si_context *)ctx;
   const uint32_t clear_value = 0x12345678;
   /* Compute-shader variants: dwords written per thread, and the
    * WAVES_PER_SH limit applied to the compute dispatch (0 = no limit). */
   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
   static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};

/* Methods 0..2 are the CP DMA variants (one per cache policy); methods >= 3
 * enumerate all combinations of (waves_per_sh, cache policy, dwords/thread)
 * for compute shaders — see the index decoding in the method loop below. */
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
#define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

   /* Row labels for the three CP methods; the L2 policy column ("L2p")
    * distinguishes the two "CP L2 " entries. */
   static const char *method_str[] = {
      "CP MC ",
      "CP L2 ",
      "CP L2 ",
   };
   static const char *placement_str[] = {
      /* Clear */
      "fill->VRAM",
      "fill->GTT ",
      /* Copy */
      "VRAM->VRAM",
      "VRAM->GTT ",
      "GTT ->VRAM",
   };

   /* Print the CSV header: one column per benchmarked size. */
   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
   printf("Heap ,Method ,L2p,Wa,");
   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
      if (size >= 1024)
         printf("%6uKB,", size / 1024);
      else
         printf(" %6uB,", size);
   }
   printf("\n");

   /* results[log2(size)][placement][method][] */
   struct si_result {
      bool is_valid;              /* set once this (size, placement, method) was measured */
      bool is_cp;                 /* CP DMA method */
      bool is_cs;                 /* compute-shader method */
      unsigned cache_policy;      /* L2_BYPASS / L2_STREAM / L2_LRU */
      unsigned dwords_per_thread; /* CS only */
      unsigned waves_per_sh;      /* CS only; 0 = unlimited */
      unsigned score;             /* measured MB/s (truncated to unsigned) */
      unsigned index;             /* index in results[x][y][index] */
   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

   /* Run benchmarks. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      /* Placements 0-1 are clears, 2-4 are copies (see placement_str). */
      bool is_copy = placement >= 2;

      printf("-----------,--------,---,--,");
      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
         printf("--------,");
      printf("\n");

      for (unsigned method = 0; method < NUM_METHODS; method++) {
         /* Decode the flat method index:
          *   0..2                    -> CP DMA, cache_policy = method % 3
          *   3..NUM_METHODS-1        -> compute shader, where (method - 3) is
          *     waves_index * (3 * NUM_SHADERS) + policy * NUM_SHADERS + shader_index
          */
         bool test_cp = method <= 2;
         bool test_cs = method >= 3;
         unsigned cs_method = method - 3;
         unsigned cs_waves_per_sh =
            test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
         cs_method %= 3 * NUM_SHADERS;
         unsigned cache_policy =
            test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
         unsigned cs_dwords_per_thread =
            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

         if (sctx->chip_class == GFX6) {
            /* GFX6 doesn't support CP DMA operations through L2. */
            if (test_cp && cache_policy != L2_BYPASS)
               continue;
            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
            if (test_cs && cs_waves_per_sh % 16 != 0)
               continue;
         }

         /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
          * chips before gfx9.
          */
         if (test_cs && cache_policy && sctx->chip_class < GFX9)
            continue;

         /* Print the row label (heap, method, L2 policy, waves limit). */
         printf("%s ,", placement_str[placement]);
         if (test_cs) {
            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
                   cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
         } else {
            /* For CP methods, method == cache_policy (method <= 2 here), so
             * comparing 'method' against the policy enums labels the policy
             * column — NOTE(review): this relies on the L2_* enum values
             * matching the 0..2 method indices; verify against si_pipe.h. */
            printf("%s,%3s,", method_str[method],
                   method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
         }
         if (test_cs && cs_waves_per_sh)
            printf("%2u,", cs_waves_per_sh);
         else
            printf(" ,");

         /* One shader per method row, reused for all sizes below. */
         void *compute_shader = NULL;
         if (test_cs) {
            compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
                                                          cache_policy == L2_STREAM, is_copy);
         }

         double score = 0;
         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Don't test bigger sizes if it's too slow. Print 0. */
            if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
               printf("%7.0f ,", 0.0);
               continue;
            }

            enum pipe_resource_usage dst_usage, src_usage;
            struct pipe_resource *dst, *src;
            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
            unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;

            /* Map the placement to dst/src heaps: DEFAULT = VRAM,
             * STREAM = GTT (see placement_str ordering). */
            if (placement == 0 || placement == 2 || placement == 4)
               dst_usage = PIPE_USAGE_DEFAULT;
            else
               dst_usage = PIPE_USAGE_STREAM;

            if (placement == 2 || placement == 3)
               src_usage = PIPE_USAGE_DEFAULT;
            else
               src_usage = PIPE_USAGE_STREAM;

            dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
            src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;

            /* Wait for idle before testing, so that other processes don't mess up the results. */
            sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                           SI_CONTEXT_FLUSH_AND_INV_CB |
                           SI_CONTEXT_FLUSH_AND_INV_DB;
            sctx->emit_cache_flush(sctx, &sctx->gfx_cs);

            /* Time all NUM_RUNS iterations inside one elapsed-time query. */
            struct pipe_query *q = ctx->create_query(ctx, query_type, 0);
            ctx->begin_query(ctx, q);

            /* Run tests. */
            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               if (test_cp) {
                  /* CP DMA */
                  if (is_copy) {
                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, SI_OP_SYNC_BEFORE_AFTER,
                                           SI_COHERENCY_NONE, cache_policy);
                  } else {
                     si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value,
                                            SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_NONE,
                                            cache_policy);
                  }
               } else {
                  /* Compute */
                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
                   * writes the 2nd contiguous block of data, etc.
                   */
                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
                  /* 64 = wave size assumed by this dispatch math —
                   * NOTE(review): Wave64-only; confirm for Wave32 chips. */
                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;

                  unsigned num_dwords = size / 4;
                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                  /* One workgroup per wave; grid size covers the whole buffer. */
                  struct pipe_grid_info info = {};
                  info.block[0] = MIN2(64, num_instructions);
                  info.block[1] = 1;
                  info.block[2] = 1;
                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                  info.grid[1] = 1;
                  info.grid[2] = 1;

                  struct pipe_shader_buffer sb[2] = {};
                  sb[0].buffer = dst;
                  sb[0].buffer_size = size;

                  if (is_copy) {
                     sb[1].buffer = src;
                     sb[1].buffer_size = size;
                  } else {
                     /* Clears read the fill pattern from user data registers. */
                     for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value;
                  }

                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
                  ctx->bind_compute_state(ctx, compute_shader);
                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                  ctx->launch_grid(ctx, &info);

                  ctx->bind_compute_state(ctx, NULL);
                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */
               }

               /* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
               sctx->flags |= SI_CONTEXT_INV_VCACHE |
                              (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
                              SI_CONTEXT_CS_PARTIAL_FLUSH;
               sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
            }

            ctx->end_query(ctx, q);
            ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);

            pipe_resource_reference(&dst, NULL);
            pipe_resource_reference(&src, NULL);

            /* Get results. */

            union pipe_query_result result;

            /* wait=true: blocks until the GPU work above has finished. */
            ctx->get_query_result(ctx, q, true, &result);
            ctx->destroy_query(ctx, q);

            /* result.u64 is total elapsed ns; average over the runs. */
            score = get_MBps_rate(size, result.u64 / (double)NUM_RUNS);
            printf("%7.0f ,", score);
            fflush(stdout);

            /* Record this measurement for the analysis phase below. */
            struct si_result *r = &results[util_logbase2(size)][placement][method];
            r->is_valid = true;
            r->is_cp = test_cp;
            r->is_cs = test_cs;
            r->cache_policy = cache_policy;
            r->dwords_per_thread = cs_dwords_per_thread;
            r->waves_per_sh = cs_waves_per_sh;
            r->score = score;
            r->index = method;
         }
         puts("");

         if (compute_shader)
            ctx->delete_compute_state(ctx, compute_shader);
      }
   }

   /* Emit generated C source: a per-chip function that picks the best
    * clear method. The copy variant is opened when placement reaches 2. */
   puts("");
   puts("static struct si_method");
   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
          "cached)\n",
          sctx->screen->info.name);
   puts("{");
   puts(" unsigned size = MIN2(size64, UINT_MAX);\n");

   /* Analyze results and find the best methods. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      /* Open the matching branch in the generated code; placement 2 also
       * closes the clear function and starts the copy function. */
      if (placement == 0)
         puts(" if (dst == RADEON_DOMAIN_VRAM) {");
      else if (placement == 1)
         puts(" } else { /* GTT */");
      else if (placement == 2) {
         puts("}");
         puts("");
         puts("static struct si_method");
         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                sctx->screen->info.name);
         printf(" uint64_t size64, bool async, bool cached)\n");
         puts("{");
         puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
         puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
      } else if (placement == 3)
         puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
      else
         puts(" } else { /* GTT -> VRAM */");

      /* Three generated-code branches: async compute, cached gfx, uncached gfx. */
      for (unsigned mode = 0; mode < 3; mode++) {
         bool async = mode == 0;
         bool cached = mode == 1;

         if (async)
            puts(" if (async) { /* async compute */");
         else if (cached)
            puts(" if (cached) { /* gfx ring */");
         else
            puts(" } else { /* gfx ring - uncached */");

         /* The list of best chosen methods. */
         struct si_result *methods[32];
         unsigned method_max_size[32]; /* largest size each chosen method covers */
         unsigned num_methods = 0;

         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Find the best method. */
            struct si_result *best = NULL;

            for (unsigned i = 0; i < NUM_METHODS; i++) {
               struct si_result *r = &results[util_logbase2(size)][placement][i];

               if (!r->is_valid)
                  continue;

               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                * on GTT, which we can get due to BO evictions.
                */
               if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
                   r->cache_policy == L2_BYPASS)
                  continue;

               if (async) {
                  /* The following constraints for compute IBs try to limit
                   * resource usage so as not to decrease the performance
                   * of gfx IBs too much.
                   */

                  /* Don't use CP DMA on asynchronous rings, because
                   * the engine is shared with gfx IBs.
                   */
                  if (r->is_cp)
                     continue;

                  /* Don't use L2 caching on asynchronous rings to minimize
                   * L2 usage.
                   */
                  if (r->cache_policy == L2_LRU)
                     continue;

                  /* Asynchronous compute recommends waves_per_sh != 0
                   * to limit CU usage. */
                  if (r->is_cs && r->waves_per_sh == 0)
                     continue;
               } else {
                  /* On the gfx ring, match the cache policy to whether the
                   * destination will be read back soon ("cached"). */
                  if (cached && r->cache_policy == L2_BYPASS)
                     continue;
                  if (!cached && r->cache_policy == L2_LRU)
                     continue;
               }

               if (!best) {
                  best = r;
                  continue;
               }

               /* Assume some measurement error. Earlier methods occupy fewer
                * resources, so the next method is always more greedy, and we
                * don't want to select it due to a measurement error.
                */
               double min_improvement = 1.03;

               if (best->score * min_improvement < r->score)
                  best = r;
            }

            if (num_methods > 0) {
               unsigned prev_index = num_methods - 1;
               struct si_result *prev = methods[prev_index];
               struct si_result *prev_this_size =
                  &results[util_logbase2(size)][placement][prev->index];

               /* If the best one is also the best for the previous size,
                * just bump the size for the previous one.
                *
                * If there is no best, it means all methods were too slow
                * for this size and were not tested. Use the best one for
                * the previous size.
                */
               if (!best ||
                   /* If it's the same method as for the previous size: */
                   (prev->is_cp == best->is_cp &&
                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
                    prev->dwords_per_thread == best->dwords_per_thread &&
                    prev->waves_per_sh == best->waves_per_sh) ||
                   /* If the method for the previous size is also the best
                    * for this size: */
                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
                  method_max_size[prev_index] = size;
                  continue;
               }
            }

            /* Add it to the list. */
            assert(num_methods < ARRAY_SIZE(methods));
            methods[num_methods] = best;
            method_max_size[num_methods] = size;
            num_methods++;
         }

         /* Emit one "if (size <= ...) return METHOD(...);" per chosen method. */
         for (unsigned i = 0; i < num_methods; i++) {
            struct si_result *best = methods[i];
            unsigned size = method_max_size[i];

            /* The size threshold is between the current benchmarked
             * size and the next benchmarked size. */
            if (i < num_methods - 1)
               printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
            else if (i > 0)
               printf(" else ");
            else
               printf(" ");
            printf("return ");

            assert(best);
            const char *cache_policy_str =
               best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
               best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM";

            if (best->is_cp) {
               printf("CP_DMA(%s);\n", cache_policy_str);
            }
            if (best->is_cs) {
               printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
                      best->dwords_per_thread, best->waves_per_sh);
            }
         }
      }
      puts(" }");
   }
   puts(" }");
   puts("}");

   /* Benchmark tool: tear down the context and terminate the process. */
   ctx->destroy(ctx);
   exit(0);
}