/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_shader.h"

using namespace brw;

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering with latency, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part. We're trying to be cheap,
 * since actually computing the optimal scheduling is NP-complete.
 * What we do is track a "current clock". When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock. Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
 */
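
/* Hypothetical illustration of the clock heuristic described above (made-up
 * numbers, not measured data): suppose nodes A and B are both DAG heads,
 * the current clock is at cycle 10, A's earliest-unblocked time is 8 and
 * B's is 14. The scheduler emits A, bumps the clock by A's issue time, and
 * updates the unblocked times of A's children; B is reconsidered on the
 * next pick against the updated clock.
 */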

static bool debug = false;

class instruction_scheduler;

class schedule_node : public exec_node
{
public:
   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
   void set_latency_gen4();
   void set_latency_gen7(bool is_haswell);

   backend_instruction *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;

   /**
    * Which iteration of pushing groups of children onto the candidates list
    * this node was a part of.
    */
   unsigned cand_generation;

   /**
    * This is the sum of the instruction's latency plus the maximum delay of
    * its children, or just the issue_time if it's a leaf node.
    */
   int delay;

   /**
    * Preferred exit node among the (direct or indirect) successors of this
    * node. Among the scheduler nodes blocked by this node, this will be the
    * one that may cause earliest program termination, or NULL if none of the
    * successors is an exit node.
    */
   schedule_node *exit;
};

/**
 * Lower bound of the scheduling time after which one of the instructions
 * blocked by this node may lead to program termination.
 *
 * exit_unblocked_time() determines a strict partial ordering relation '«' on
 * the set of scheduler nodes as follows:
 *
 *   n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
 *
 * which can be used to heuristically order nodes according to how early they
 * can unblock an exit node and lead to program termination.
 */
static inline int
exit_unblocked_time(const schedule_node *n)
{
   return n->exit ? n->exit->unblocked_time : INT_MAX;
}
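
/* Illustrative (hypothetical) example of the ordering above: if node a's
 * preferred exit could be unblocked at cycle 20 while node b has no exit
 * node among its successors (exit_unblocked_time(b) == INT_MAX), then
 * a « b, and the heuristics below will prefer to schedule a first.
 */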

void
schedule_node::set_latency_gen4()
{
   int chans = 8;
   int math_latency = 22;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
      this->latency = 1 * chans * math_latency;
      break;
   case SHADER_OPCODE_RSQ:
      this->latency = 2 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
      /* full precision log. partial is 2. */
      this->latency = 3 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_EXP2:
      /* full precision. partial is 3, same throughput. */
      this->latency = 4 * chans * math_latency;
      break;
   case SHADER_OPCODE_POW:
      this->latency = 8 * chans * math_latency;
      break;
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* minimum latency, max is 12 rounds. */
      this->latency = 5 * chans * math_latency;
      break;
   default:
      this->latency = 2;
      break;
   }
}

void
schedule_node::set_latency_gen7(bool is_haswell)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MAD:
      /* 2 cycles
       *  (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       *  (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 18 cycles on IVB, 16 on HSW
       *  (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
       *
       * 20 cycles on IVB, 18 on HSW
       *  (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = is_haswell ? 16 : 18;
      break;

   case BRW_OPCODE_LRP:
      /* 2 cycles
       *  (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       *  (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 16 cycles on IVB, 14 on HSW
       *  (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       *
       * 16 cycles
       *  (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = 14;
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* 2 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       *
       * 18 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * Same for exp2, log2, rsq, sqrt, sin, cos.
       */
      latency = is_haswell ? 14 : 16;
      break;

   case SHADER_OPCODE_POW:
      /* 2 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       *
       * 26 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      latency = is_haswell ? 22 : 24;
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
      /* 18 cycles:
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       *
       * 697 +/-49 cycles (min 610, n=26):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * So the latency on our first texture load of the batchbuffer takes
       * ~700 cycles, since the caches are cold at that point.
       *
       * 840 +/- 92 cycles (min 720, n=25):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * On the second load, it takes just an extra ~140 cycles, and after
       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
       *
       * 683 +/- 49 cycles (min = 602, n=47):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * send(8) g50<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * The unit appears to be pipelined, since this matches up with the
       * cache-cold case, despite there being two loads here. If you replace
       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
       *
       * So, take some number between the cache-hot 140 cycles and the
       * cache-cold 700 cycles. No particular tuning was done on this.
       *
       * I haven't done significant testing of the non-TEX opcodes. TXL at
       * least looked about the same as TEX.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_TXS:
      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
       * cycles (n=15):
       * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
       * send(8) g6<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
       *
       *
       * Two loads was 535 +/- 30 cycles (n=19):
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * send(16) g6<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
       * send(16) g8<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
       * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
       *
       * Since the only caches that should matter are just the
       * instruction/state cache containing the surface state, assume that we
       * always have hot caches.
       */
      latency = 100;
      break;

   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      /* testing using varying-index pull constants:
       *
       * 16 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       *
       * ~480 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * ~620 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * So, if it's cache-hot, it's about 140. If it's cache cold, it's
       * about 460. We expect to mostly be cache hot, so pick something more
       * in that direction.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_GEN7_SCRATCH_READ:
      /* Testing a load from offset 0, that had been previously written:
       *
       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q };
       *
       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
       * then around 140. Presumably this is cache hit vs miss.
       */
      latency = 50;
      break;

   case VEC4_OPCODE_UNTYPED_ATOMIC:
      /* See GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
      latency = 14000;
      break;

   case VEC4_OPCODE_UNTYPED_SURFACE_READ:
   case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
      /* See also GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ */
      latency = is_haswell ? 300 : 600;
      break;

   case SHADER_OPCODE_SEND:
      switch (inst->sfid) {
      case BRW_SFID_SAMPLER: {
         unsigned msg_type = (inst->desc >> 12) & 0x1f;
         switch (msg_type) {
         case GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO:
         case GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO:
            /* See also SHADER_OPCODE_TXS */
            latency = 100;
            break;

         default:
            /* See also SHADER_OPCODE_TEX */
            latency = 200;
            break;
         }
         break;
      }

      case GEN6_SFID_DATAPORT_RENDER_CACHE:
         switch ((inst->desc >> 14) & 0x1f) {
         case GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE:
         case GEN7_DATAPORT_RC_TYPED_SURFACE_READ:
            /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
            assert(!is_haswell);
            latency = 600;
            break;

         case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
            /* See also SHADER_OPCODE_TYPED_ATOMIC */
            assert(!is_haswell);
            latency = 14000;
            break;

         default:
            unreachable("Unknown render cache message");
         }
         break;

      case GEN7_SFID_DATAPORT_DATA_CACHE:
         switch ((inst->desc >> 14) & 0x1f) {
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
            /* We have no data for this but assume it's roughly the same as
             * untyped surface read/write.
             */
            latency = 300;
            break;

         case GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ:
         case GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
            /* Test code:
             * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
             * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
             * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
             * send(8) g4<1>UD g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
             * .
             * . [repeats 8 times]
             * .
             * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
             * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
             * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
             * send(8) g4<1>UD g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 583 cycles per surface read,
             * standard deviation 0.9%.
             */
            assert(!is_haswell);
            latency = 600;
            break;

         case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
            /* Test code:
             * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q };
             * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all };
             * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q };
             * send(8) g4<1>ud g112<8,8,1>ud
             *   data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 13867 cycles per atomic op,
             * standard deviation 3%. Note that this is a rather
             * pessimistic estimate; the actual latency in cases with few
             * collisions between threads and favorable pipelining has been
             * seen to be reduced by a factor of 100.
             */
            assert(!is_haswell);
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      case HSW_SFID_DATAPORT_DATA_CACHE_1:
         switch ((inst->desc >> 14) & 0x1f) {
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
         case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
         case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
         case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
         case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
            /* See also GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ */
            latency = 300;
            break;

         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
         case GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
         case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP:
         case GEN9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP:
            /* See also GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      default:
         unreachable("Unknown SFID");
      }
      break;

   default:
      /* 2 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       *
       * 16 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      latency = 14;
      break;
   }
}

class instruction_scheduler {
public:
   instruction_scheduler(backend_shader *s, int grf_count,
                         unsigned hw_reg_count, int block_count,
                         instruction_scheduler_mode mode)
   {
      this->bs = s;
      this->mem_ctx = ralloc_context(NULL);
      this->grf_count = grf_count;
      this->hw_reg_count = hw_reg_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
      this->post_reg_alloc = (mode == SCHEDULE_POST);
      this->mode = mode;
      if (!post_reg_alloc) {
         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);

         this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                            BITSET_WORDS(grf_count));

         this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                             BITSET_WORDS(grf_count));

         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                BITSET_WORDS(hw_reg_count));

         this->written = rzalloc_array(mem_ctx, bool, grf_count);

         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);

         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
      } else {
         this->reg_pressure_in = NULL;
         this->livein = NULL;
         this->liveout = NULL;
         this->hw_liveout = NULL;
         this->written = NULL;
         this->reads_remaining = NULL;
         this->hw_reads_remaining = NULL;
      }
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }

   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void run(cfg_t *cfg);
   void add_insts_from_block(bblock_t *block);
   void compute_delays();
   void compute_exits();
   virtual void calculate_deps() = 0;
   virtual schedule_node *choose_instruction_to_schedule() = 0;

   /**
    * Returns how many cycles it takes the instruction to issue.
    *
    * Instructions in gen hardware are handled one simd4 vector at a time,
    * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
    */
   virtual int issue_time(backend_instruction *inst) = 0;

   virtual void count_reads_remaining(backend_instruction *inst) = 0;
   virtual void setup_liveness(cfg_t *cfg) = 0;
   virtual void update_register_pressure(backend_instruction *inst) = 0;
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;

   void schedule_instructions(bblock_t *block);

   void *mem_ctx;

   bool post_reg_alloc;
   int instructions_to_schedule;
   int grf_count;
   unsigned hw_reg_count;
   int reg_pressure;
   int block_idx;
   exec_list instructions;
   backend_shader *bs;

   instruction_scheduler_mode mode;

   /*
    * The register pressure at the beginning of each basic block.
    */

   int *reg_pressure_in;

   /*
    * The virtual GRFs whose range overlaps the beginning of each basic block.
    */

   BITSET_WORD **livein;

   /*
    * The virtual GRFs whose range overlaps the end of each basic block.
    */

   BITSET_WORD **liveout;

   /*
    * The hardware GRFs whose range overlaps the end of each basic block.
    */

   BITSET_WORD **hw_liveout;

   /*
    * Whether we've scheduled a write for this virtual GRF yet.
    */

   bool *written;

   /*
    * How many reads we haven't scheduled for this virtual GRF yet.
    */

   int *reads_remaining;

   /*
    * How many reads we haven't scheduled for this hardware GRF yet.
    */

   int *hw_reads_remaining;
};

class fs_instruction_scheduler : public instruction_scheduler
{
public:
   fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
                            int block_count,
                            instruction_scheduler_mode mode);
   void calculate_deps();
   bool is_compressed(fs_inst *inst);
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   fs_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
                                                   int grf_count, int hw_reg_count,
                                                   int block_count,
                                                   instruction_scheduler_mode mode)
   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
     v(v)
{
}

static bool
is_src_duplicate(fs_inst *inst, int src)
{
   for (int i = 0; i < src; i++)
      if (inst->src[i].equals(inst->src[src]))
         return true;

   return false;
}

void
fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!reads_remaining)
      return;

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]++;
      } else if (inst->src[i].file == FIXED_GRF) {
         if (inst->src[i].nr >= hw_reg_count)
            continue;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            hw_reads_remaining[inst->src[i].nr + j]++;
      }
   }
}

void
fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
{
   /* First, compute liveness on a per-GRF level using the in/out sets from
    * liveness calculation.
    */
   for (int block = 0; block < cfg->num_blocks; block++) {
      for (int i = 0; i < v->live_intervals->num_vars; i++) {
         if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
            int vgrf = v->live_intervals->vgrf_from_var[i];
            if (!BITSET_TEST(livein[block], vgrf)) {
               reg_pressure_in[block] += v->alloc.sizes[vgrf];
               BITSET_SET(livein[block], vgrf);
            }
         }

         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
            BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
      }
   }

   /* Now, extend the live in/live out sets for when a range crosses a block
    * boundary, which matches what our register allocator/interference code
    * does to account for force_writemask_all and incompatible exec_mask's.
    */
   for (int block = 0; block < cfg->num_blocks - 1; block++) {
      for (int i = 0; i < grf_count; i++) {
         if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
             v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
            if (!BITSET_TEST(livein[block + 1], i)) {
               reg_pressure_in[block + 1] += v->alloc.sizes[i];
               BITSET_SET(livein[block + 1], i);
            }

            BITSET_SET(liveout[block], i);
         }
      }
   }

   int payload_last_use_ip[hw_reg_count];
   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);

   for (unsigned i = 0; i < hw_reg_count; i++) {
      if (payload_last_use_ip[i] == -1)
         continue;

      for (int block = 0; block < cfg->num_blocks; block++) {
         if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
            reg_pressure_in[block]++;

         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
            BITSET_SET(hw_liveout[block], i);
      }
   }
}

void
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!reads_remaining)
      return;

   if (inst->dst.file == VGRF) {
      written[inst->dst.nr] = true;
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]--;
      } else if (inst->src[i].file == FIXED_GRF &&
                 inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++)
            hw_reads_remaining[inst->src[i].nr + off]--;
      }
   }
}

int
fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;
   int benefit = 0;

   if (inst->dst.file == VGRF) {
      if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
          !written[inst->dst.nr])
         benefit -= v->alloc.sizes[inst->dst.nr];
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF &&
          !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
          reads_remaining[inst->src[i].nr] == 1)
         benefit += v->alloc.sizes[inst->src[i].nr];

      if (inst->src[i].file == FIXED_GRF &&
          inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++) {
            int reg = inst->src[i].nr + off;
            if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
                hw_reads_remaining[reg] == 1) {
               benefit++;
            }
         }
      }
   }

   return benefit;
}

class vec4_instruction_scheduler : public instruction_scheduler
{
public:
   vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
   void calculate_deps();
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   vec4_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
                                                       int grf_count)
   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
     v(v)
{
}

void
vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
{
}

void
vec4_instruction_scheduler::setup_liveness(cfg_t *)
{
}

void
vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
{
}

int
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
{
   return 0;
}

schedule_node::schedule_node(backend_instruction *inst,
                             instruction_scheduler *sched)
{
   const struct gen_device_info *devinfo = sched->bs->devinfo;

   this->inst = inst;
   this->child_array_size = 0;
   this->children = NULL;
   this->child_latency = NULL;
   this->child_count = 0;
   this->parent_count = 0;
   this->unblocked_time = 0;
   this->cand_generation = 0;
   this->delay = 0;
   this->exit = NULL;

   /* We can't measure Gen6 timings directly but expect them to be much
    * closer to Gen7 than Gen4.
    */
   if (!sched->post_reg_alloc)
      this->latency = 1;
   else if (devinfo->gen >= 6)
      set_latency_gen7(devinfo->is_haswell);
   else
      set_latency_gen4();
}

void
instruction_scheduler::add_insts_from_block(bblock_t *block)
{
   foreach_inst_in_block(backend_instruction, inst, block) {
      schedule_node *n = new(mem_ctx) schedule_node(inst, this);

      instructions.push_tail(n);
   }

   this->instructions_to_schedule = block->end_ip - block->start_ip + 1;
}

/** Computation of the delay member of each node. */
void
instruction_scheduler::compute_delays()
{
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      if (!n->child_count) {
         n->delay = issue_time(n->inst);
      } else {
         for (int i = 0; i < n->child_count; i++) {
            assert(n->children[i]->delay);
            n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
         }
      }
   }
}

void
instruction_scheduler::compute_exits()
{
   /* Calculate a lower bound of the scheduling time of each node in the
    * graph. This is analogous to the node's critical path but calculated
    * from the top instead of from the bottom of the block.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      for (int i = 0; i < n->child_count; i++) {
         n->children[i]->unblocked_time =
            MAX2(n->children[i]->unblocked_time,
                 n->unblocked_time + issue_time(n->inst) + n->child_latency[i]);
      }
   }

   /* Calculate the exit of each node by induction based on the exit nodes of
    * its children. The preferred exit of a node is the one among the exit
    * nodes of its children which can be unblocked first according to the
    * optimistic unblocked time estimate calculated above.
    */
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      n->exit = (n->inst->opcode == FS_OPCODE_DISCARD_JUMP ? n : NULL);

      for (int i = 0; i < n->child_count; i++) {
         if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n))
            n->exit = n->children[i]->exit;
      }
   }
}
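
/* Hypothetical worked example of the delay calculation above (made-up
 * numbers): a leaf node whose instruction issues in 2 cycles gets
 * delay == 2; a parent with latency 14 whose only child is that leaf gets
 * delay == 14 + 2 == 16, i.e. an estimate of its critical path to the end
 * of the block.
 */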

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before. We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

static bool
is_scheduling_barrier(const backend_instruction *inst)
{
   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
          inst->is_control_flow() ||
          inst->has_side_effects();
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it. This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         if (is_scheduling_barrier(prev->inst))
            break;
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         if (is_scheduling_barrier(next->inst))
            break;
         next = (schedule_node *)next->next;
      }
   }
}

/* instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
fs_instruction_scheduler::is_compressed(fs_inst *inst)
{
   return inst->exec_size == 16;
}
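
/* Illustrative note (an assumption drawn from the COMPR4 handling in
 * calculate_deps() below, not from measured data): a compressed (SIMD16)
 * MRF write to m2 also writes a second MRF, so both slots must be tracked.
 * Without BRW_MRF_COMPR4 the second half lands in the adjacent register
 * (m2/m3); with COMPR4 addressing it lands four registers away (m2/m6).
 */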

void
fs_instruction_scheduler::calculate_deps()
{
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
    * After register allocation, reg_offsets are gone and we track individual
    * GRF registers.
    */
   schedule_node **last_grf_write;
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
   schedule_node *last_conditional_mod[8] = {};
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately. We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   last_grf_write = (schedule_node **)calloc(sizeof(schedule_node *), grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(last_grf_write[inst->src[i].nr * 16 +
                                         inst->src[i].offset / REG_SIZE + r], n);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               add_dep(last_fixed_grf_write, n);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(last_conditional_mod[i], n);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr * 16 +
                                      inst->dst.offset / REG_SIZE + r], n);
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written()) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i)) {
               add_dep(last_conditional_mod[i], n, 0);
               last_conditional_mod[i] = n;
            }
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(n, last_grf_write[inst->src[i].nr * 16 +
                                            inst->src[i].offset / REG_SIZE + r], 0);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               add_dep(n, last_fixed_grf_write, 0);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write, 0);
         } else if (inst->src[i].file == ARF) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(n, last_conditional_mod[i]);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written()) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               last_conditional_mod[i] = n;
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }

   free(last_grf_write);
}

void
vec4_instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
   schedule_node *last_conditional_mod = NULL;
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately. We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(last_grf_write[inst->src[i].nr + j], n);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].is_accumulator()) {
            assert(last_accumulator_write);
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF) {
            add_barrier_deps(n);
         }
      }

      if (inst->reads_g0_implicitly())
         add_dep(last_fixed_grf_write, n);

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (inst->reads_flag()) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      if (inst->reads_accumulator_implicitly()) {
         assert(last_accumulator_write);
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j) {
            add_dep(last_grf_write[inst->dst.nr + j], n);
            last_grf_write[inst->dst.nr + j] = n;
         }
      } else if (inst->dst.file == MRF) {
         add_dep(last_mrf_write[inst->dst.nr], n);
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(n, last_grf_write[inst->src[i].nr + j]);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write);
         } else if (inst->src[i].file == ARF) {
            add_barrier_deps(n);
         }
      }

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (inst->reads_flag()) {
         add_dep(n, last_conditional_mod);
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j)
            last_grf_write[inst->dst.nr + j] = n;
      } else if (inst->dst.file == MRF) {
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }
}

schedule_node *
fs_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;

   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
      int chosen_time = 0;

      /* Of the instructions ready to execute or the closest to being ready,
       * choose the one most likely to unblock an early program exit, or
       * otherwise the oldest one.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         if (!chosen ||
             exit_unblocked_time(n) < exit_unblocked_time(chosen) ||
             (exit_unblocked_time(n) == exit_unblocked_time(chosen) &&
              n->unblocked_time < chosen_time)) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }
   } else {
      /* Before register allocation, we don't care about the latencies of
       * instructions. All we care about is reducing live intervals of
       * variables so that we can avoid register spilling, or get SIMD16
       * shaders which naturally do a better job of hiding instruction
       * latency.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         fs_inst *inst = (fs_inst *)n->inst;

         if (!chosen) {
            chosen = n;
            continue;
         }

         /* Most important: If we can definitely reduce register pressure, do
          * so immediately.
          */
         int register_pressure_benefit = get_register_pressure_benefit(n->inst);
         int chosen_register_pressure_benefit =
            get_register_pressure_benefit(chosen->inst);

         if (register_pressure_benefit > 0 &&
             register_pressure_benefit > chosen_register_pressure_benefit) {
            chosen = n;
            continue;
         } else if (chosen_register_pressure_benefit > 0 &&
                    (register_pressure_benefit <
                     chosen_register_pressure_benefit)) {
            continue;
         }

         if (mode == SCHEDULE_PRE_LIFO) {
            /* Prefer instructions that recently became available for
             * scheduling. These are the things that are most likely to
             * (eventually) make a variable dead and reduce register pressure.
             * Typical register pressure estimates don't work for us because
             * most of our pressure comes from texturing, where no single
             * instruction to schedule will make a vec4 value dead.
             */
            if (n->cand_generation > chosen->cand_generation) {
               chosen = n;
               continue;
            } else if (n->cand_generation < chosen->cand_generation) {
               continue;
            }

            /* On MRF-using chips, prefer non-SEND instructions. If we don't
             * do this, then because we prefer instructions that just became
             * candidates, we'll end up in a pattern of scheduling a SEND,
             * then the MRFs for the next SEND, then the next SEND, then the
             * MRFs, etc., without ever consuming the results of a send.
             */
            if (v->devinfo->gen < 7) {
               fs_inst *chosen_inst = (fs_inst *)chosen->inst;

               /* We use size_written > 4 * exec_size as our test for the kind
                * of send instruction to avoid -- only sends generate many
                * regs, and a single-result send is probably actually reducing
                * register pressure.
                */
               if (inst->size_written <= 4 * inst->exec_size &&
                   chosen_inst->size_written > 4 * chosen_inst->exec_size) {
                  chosen = n;
                  continue;
               } else if (inst->size_written > chosen_inst->size_written) {
                  continue;
               }
            }
         }

         /* For instructions pushed on the cands list at the same time, prefer
          * the one with the highest delay to the end of the program. This is
          * most likely to have its values able to be consumed first (such as
          * for a large tree of lowered ubo loads, which appear reversed in
          * the instruction stream with respect to when they can be consumed).
          */
         if (n->delay > chosen->delay) {
            chosen = n;
            continue;
         } else if (n->delay < chosen->delay) {
            continue;
         }

         /* Prefer the node most likely to unblock an early program exit.
          */
         if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) {
            chosen = n;
            continue;
         } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) {
            continue;
         }

         /* If all other metrics are equal, we prefer the first instruction in
          * the list (program execution).
          */
      }
   }

   return chosen;
}

schedule_node *
vec4_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;
   int chosen_time = 0;

   /* Of the instructions ready to execute or the closest to being ready,
    * choose the oldest one.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      if (!chosen || n->unblocked_time < chosen_time) {
         chosen = n;
         chosen_time = n->unblocked_time;
      }
   }

   return chosen;
}

int
fs_instruction_scheduler::issue_time(backend_instruction *inst)
{
   const unsigned overhead = v->bank_conflict_cycles((fs_inst *)inst);
   if (is_compressed((fs_inst *)inst))
      return 4 + overhead;
   else
      return 2 + overhead;
}

int
vec4_instruction_scheduler::issue_time(backend_instruction *)
{
   /* We always execute as two vec4s in parallel. */
   return 2;
}
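
/* Worked example for issue_time() above (hypothetical numbers): a SIMD16
 * fs_inst with a 2-cycle penalty reported by bank_conflict_cycles() would
 * issue in 4 + 2 = 6 cycles, while an uncompressed SIMD8 instruction with
 * no penalty issues in 2.
 */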

void
instruction_scheduler::schedule_instructions(bblock_t *block)
{
   const struct gen_device_info *devinfo = bs->devinfo;
   int time = 0;
   if (!post_reg_alloc)
      reg_pressure = reg_pressure_in[block->num];
   block_idx = block->num;

   /* Remove non-DAG heads from the list. */
   foreach_in_list_safe(schedule_node, n, &instructions) {
      if (n->parent_count != 0)
         n->remove();
   }

   unsigned cand_generation = 1;
   while (!instructions.is_empty()) {
      schedule_node *chosen = choose_instruction_to_schedule();

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      chosen->inst->exec_node::remove();
      block->instructions.push_tail(chosen->inst);
      instructions_to_schedule--;

      if (!post_reg_alloc) {
         reg_pressure -= get_register_pressure_benefit(chosen->inst);
         update_register_pressure(chosen->inst);
      }

      /* If we expected a delay for scheduling, then bump the clock to reflect
       * that. In reality, the hardware will switch to another hyperthread
       * and may not return to dispatching our thread for a while even after
       * we're unblocked. After this, we have the time when the chosen
       * instruction will start executing.
       */
      time = MAX2(time, chosen->unblocked_time);

      /* Update the clock for how soon an instruction could start after the
       * chosen one.
       */
      time += issue_time(chosen->inst);

      if (debug) {
         fprintf(stderr, "clock %4d, scheduled: ", time);
         bs->dump_instruction(chosen->inst);
         if (!post_reg_alloc)
            fprintf(stderr, "(register pressure %d)\n", reg_pressure);
      }

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled. Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = chosen->child_count - 1; i >= 0; i--) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         if (debug) {
            fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
            bs->dump_instruction(child->inst);
         }

         child->cand_generation = cand_generation;
         child->parent_count--;
         if (child->parent_count == 0) {
            if (debug) {
               fprintf(stderr, "\t\tnow available\n");
            }
            instructions.push_head(child);
         }
      }
      cand_generation++;

      /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
       * but it's more limited pre-gen6, so if we send something off to it then
       * the next math instruction isn't going to make progress until the first
       * is done.
       */
      if (devinfo->gen < 6 && chosen->inst->is_math()) {
         foreach_in_list(schedule_node, n, &instructions) {
            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   assert(instructions_to_schedule == 0);

   block->cycle_count = time;
}

static unsigned get_cycle_count(cfg_t *cfg)
{
   unsigned count = 0, multiplier = 1;
   foreach_block(block, cfg) {
      if (block->start()->opcode == BRW_OPCODE_DO)
         multiplier *= 10; /* assume that loops execute ~10 times */

      count += block->cycle_count * multiplier;

      if (block->end()->opcode == BRW_OPCODE_WHILE)
         multiplier /= 10;
   }

   return count;
}

void
instruction_scheduler::run(cfg_t *cfg)
{
   if (debug && !post_reg_alloc) {
      fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bs->dump_instructions();
   }

   if (!post_reg_alloc)
      setup_liveness(cfg);

   foreach_block(block, cfg) {
      if (reads_remaining) {
         memset(reads_remaining, 0,
                grf_count * sizeof(*reads_remaining));
         memset(hw_reads_remaining, 0,
                hw_reg_count * sizeof(*hw_reads_remaining));
         memset(written, 0, grf_count * sizeof(*written));

         foreach_inst_in_block(fs_inst, inst, block)
            count_reads_remaining(inst);
      }

      add_insts_from_block(block);

      calculate_deps();

      compute_delays();
      compute_exits();

      schedule_instructions(block);
   }

   if (debug && !post_reg_alloc) {
      fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bs->dump_instructions();
   }

   cfg->cycle_count = get_cycle_count(cfg);
}

void
fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
{
   if (mode != SCHEDULE_POST)
      calculate_live_intervals();

   int grf_count;
   if (mode == SCHEDULE_POST)
      grf_count = grf_used;
   else
      grf_count = alloc.count;

   fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
                                  cfg->num_blocks, mode);
   sched.run(cfg);

   invalidate_live_intervals();
}

void
vec4_visitor::opt_schedule_instructions()
{
   vec4_instruction_scheduler sched(this, prog_data->total_grf);
   sched.run(cfg);

   invalidate_live_intervals();
}