/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_shader.h"

using namespace brw;

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering with latency, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part.  We're trying to be cheap,
 * since actually computing the optimal scheduling is NP complete.
 * What we do is track a "current clock".  When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock.  Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
 */

static bool debug = false;

class instruction_scheduler;

class schedule_node : public exec_node
{
public:
   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
   void set_latency_gfx4();
   void set_latency_gfx7(bool is_haswell);

   const struct intel_device_info *devinfo;
   backend_instruction *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;

   /**
    * Which iteration of pushing groups of children onto the candidates list
    * this node was a part of.
    */
   unsigned cand_generation;

   /**
    * This is the sum of the instruction's latency plus the maximum delay of
    * its children, or just the issue_time if it's a leaf node.
    */
   int delay;

   /**
    * Preferred exit node among the (direct or indirect) successors of this
    * node.  Among the scheduler nodes blocked by this node, this will be the
    * one that may cause earliest program termination, or NULL if none of the
    * successors is an exit node.
    */
   schedule_node *exit;
};

/**
 * Lower bound of the scheduling time after which one of the instructions
 * blocked by this node may lead to program termination.
 *
 * exit_unblocked_time() determines a strict partial ordering relation '«' on
 * the set of scheduler nodes as follows:
 *
 *   n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
 *
 * which can be used to heuristically order nodes according to how early they
 * can unblock an exit node and lead to program termination.
 */
static inline int
exit_unblocked_time(const schedule_node *n)
{
   return n->exit ? n->exit->unblocked_time : INT_MAX;
}

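/* Latency estimates for the Gfx4 math box: each math opcode's latency below
 * is a multiple of chans * math_latency, scaled roughly by the number of
 * rounds the function takes, while everything else defaults to 2 cycles.
 */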
void
schedule_node::set_latency_gfx4()
{
   int chans = 8;
   int math_latency = 22;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
      this->latency = 1 * chans * math_latency;
      break;
   case SHADER_OPCODE_RSQ:
      this->latency = 2 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
      /* full precision log.  partial is 2. */
      this->latency = 3 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_EXP2:
      /* full precision.  partial is 3, same throughput. */
      this->latency = 4 * chans * math_latency;
      break;
   case SHADER_OPCODE_POW:
      this->latency = 8 * chans * math_latency;
      break;
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* minimum latency, max is 12 rounds. */
      this->latency = 5 * chans * math_latency;
      break;
   default:
      this->latency = 2;
      break;
   }
}

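/* Empirical per-opcode latency estimates for Gfx7+.  The cycle counts quoted
 * in the case-by-case comments below were obtained by timing each
 * instruction with and without a dependent MOV on IVB and HSW.
 */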
void
schedule_node::set_latency_gfx7(bool is_haswell)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MAD:
      /* 2 cycles
       * (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       * (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 18 cycles on IVB, 16 on HSW
       * (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       *
       * 20 cycles on IVB, 18 on HSW
       * (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = is_haswell ? 16 : 18;
      break;

   case BRW_OPCODE_LRP:
      /* 2 cycles
       * (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       * (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 16 cycles on IVB, 14 on HSW
       * (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       *
       * 16 cycles
       * (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = 14;
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* 2 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       *
       * 18 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * Same for exp2, log2, rsq, sqrt, sin, cos.
       */
      latency = is_haswell ? 14 : 16;
      break;

   case SHADER_OPCODE_POW:
      /* 2 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       *
       * 26 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      latency = is_haswell ? 22 : 24;
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
      /* 18 cycles:
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       *
       * 697 +/-49 cycles (min 610, n=26):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * So the latency on our first texture load of the batchbuffer takes
       * ~700 cycles, since the caches are cold at that point.
       *
       * 840 +/- 92 cycles (min 720, n=25):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * On the second load, it takes just an extra ~140 cycles, and after
       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
       *
       * 683 +/- 49 cycles (min = 602, n=47):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * send(8) g50<1>UW g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * The unit appears to be pipelined, since this matches up with the
       * cache-cold case, despite there being two loads here.  If you replace
       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
       *
       * So, take some number between the cache-hot 140 cycles and the
       * cache-cold 700 cycles.  No particular tuning was done on this.
       *
       * I haven't done significant testing of the non-TEX opcodes.  TXL at
       * least looked about the same as TEX.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_TXS:
      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
       * cycles (n=15):
       * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
       * send(8) g6<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
       *
       *
       * Two loads was 535 +/- 30 cycles (n=19):
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * send(16) g6<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
       * send(16) g8<1>UW g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
       * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
       *
       * Since the only caches that should matter are just the
       * instruction/state cache containing the surface state, assume that we
       * always have hot caches.
       */
      latency = 100;
      break;

   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      /* testing using varying-index pull constants:
       *
       * 16 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       *
       * ~480 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * ~620 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
       * about 460.  We expect to mostly be cache hot, so pick something more
       * in that direction.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_GFX7_SCRATCH_READ:
      /* Testing a load from offset 0, that had been previously written:
       *
       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q };
       *
       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
       * then around 140.  Presumably this is cache hit vs miss.
       */
      latency = 50;
      break;

   case VEC4_OPCODE_UNTYPED_ATOMIC:
      /* See GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
      latency = 14000;
      break;

   case VEC4_OPCODE_UNTYPED_SURFACE_READ:
   case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
      /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */
      latency = is_haswell ? 300 : 600;
      break;

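   /* Split sends don't encode the operation in the opcode, so estimate their
    * latency from the shared function (SFID) and message descriptor they
    * target instead.
    */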
   case SHADER_OPCODE_SEND:
      switch (inst->sfid) {
      case BRW_SFID_SAMPLER: {
         unsigned msg_type = (inst->desc >> 12) & 0x1f;
         switch (msg_type) {
         case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO:
         case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO:
            /* See also SHADER_OPCODE_TXS */
            latency = 100;
            break;

         default:
            /* See also SHADER_OPCODE_TEX */
            latency = 200;
            break;
         }
         break;
      }

      case GFX6_SFID_DATAPORT_RENDER_CACHE:
         switch (brw_fb_desc_msg_type(devinfo, inst->desc)) {
         case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE:
         case GFX7_DATAPORT_RC_TYPED_SURFACE_READ:
            /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
            assert(!is_haswell);
            latency = 600;
            break;

         case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
            /* See also SHADER_OPCODE_TYPED_ATOMIC */
            assert(!is_haswell);
            latency = 14000;
            break;

         case GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE:
            /* completely fabricated number */
            latency = 600;
            break;

         default:
            unreachable("Unknown render cache message");
         }
         break;

      case GFX7_SFID_DATAPORT_DATA_CACHE:
         switch ((inst->desc >> 14) & 0x1f) {
         case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ:
         case GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ:
         case GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE:
            /* We have no data for this but assume it's a little faster than
             * untyped surface read/write.
             */
            latency = 200;
            break;

         case GFX7_DATAPORT_DC_DWORD_SCATTERED_READ:
         case GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE:
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
            /* We have no data for this but assume it's roughly the same as
             * untyped surface read/write.
             */
            latency = 300;
            break;

         case GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ:
         case GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
            /* Test code:
             * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
             * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
             * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
             * send(8) g4<1>UD g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
             * .
             * . [repeats 8 times]
             * .
             * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
             * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
             * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
             * send(8) g4<1>UD g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 583 cycles per surface read,
             * standard deviation 0.9%.
             */
            assert(!is_haswell);
            latency = 600;
            break;

         case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
            /* Test code:
             * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q };
             * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all };
             * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q };
             * send(8) g4<1>ud g112<8,8,1>ud
             *   data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 13867 cycles per atomic op,
             * standard deviation 3%.  Note that this is a rather
             * pessimistic estimate, the actual latency in cases with few
             * collisions between threads and favorable pipelining has been
             * seen to be reduced by a factor of 100.
             */
            assert(!is_haswell);
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      case HSW_SFID_DATAPORT_DATA_CACHE_1:
         switch ((inst->desc >> 14) & 0x1f) {
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
         case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
         case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
         case GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
         case GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
         case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ:
         case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE:
            /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */
            latency = 300;
            break;

         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
         case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
         case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP:
         case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP:
         case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP:
         case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP:
            /* See also GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      case GFX12_SFID_UGM:
      case GFX12_SFID_TGM:
      case GFX12_SFID_SLM:
         switch (lsc_msg_desc_opcode(devinfo, inst->desc)) {
         case LSC_OP_LOAD:
         case LSC_OP_STORE:
         case LSC_OP_LOAD_CMASK:
         case LSC_OP_STORE_CMASK:
            latency = 300;
            break;
         case LSC_OP_FENCE:
         case LSC_OP_ATOMIC_INC:
         case LSC_OP_ATOMIC_DEC:
         case LSC_OP_ATOMIC_LOAD:
         case LSC_OP_ATOMIC_STORE:
         case LSC_OP_ATOMIC_ADD:
         case LSC_OP_ATOMIC_SUB:
         case LSC_OP_ATOMIC_MIN:
         case LSC_OP_ATOMIC_MAX:
         case LSC_OP_ATOMIC_UMIN:
         case LSC_OP_ATOMIC_UMAX:
         case LSC_OP_ATOMIC_CMPXCHG:
         case LSC_OP_ATOMIC_FADD:
         case LSC_OP_ATOMIC_FSUB:
         case LSC_OP_ATOMIC_FMIN:
         case LSC_OP_ATOMIC_FMAX:
         case LSC_OP_ATOMIC_FCMPXCHG:
         case LSC_OP_ATOMIC_AND:
         case LSC_OP_ATOMIC_OR:
         case LSC_OP_ATOMIC_XOR:
            latency = 1400;
            break;
         default:
            unreachable("unsupported new data port message instruction");
         }
         break;

      case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
      case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
         /* TODO.
          *
          * We'll assume for the moment that this is pretty quick as it
          * doesn't actually return any data.
          */
         latency = 200;
         break;

      default:
         unreachable("Unknown SFID");
      }
      break;

   default:
      /* 2 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       *
       * 16 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      latency = 14;
      break;
   }
}

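/* Common machinery shared by the scalar (FS) and vec4 schedulers: it owns
 * the schedule_node list for the current block, the dependency edges between
 * the nodes, and (pre-register-allocation only) the liveness data used to
 * estimate register pressure while scheduling.
 */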
class instruction_scheduler {
public:
   instruction_scheduler(const backend_shader *s, int grf_count,
                         unsigned hw_reg_count, int block_count,
                         instruction_scheduler_mode mode):
      bs(s)
   {
      this->mem_ctx = ralloc_context(NULL);
      this->grf_count = grf_count;
      this->hw_reg_count = hw_reg_count;
      this->instructions.make_empty();
      this->post_reg_alloc = (mode == SCHEDULE_POST);
      this->mode = mode;
      this->reg_pressure = 0;
      this->block_idx = 0;
      if (!post_reg_alloc) {
         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);

         this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                            BITSET_WORDS(grf_count));

         this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                             BITSET_WORDS(grf_count));

         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                BITSET_WORDS(hw_reg_count));

         this->written = rzalloc_array(mem_ctx, bool, grf_count);

         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);

         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
      } else {
         this->reg_pressure_in = NULL;
         this->livein = NULL;
         this->liveout = NULL;
         this->hw_liveout = NULL;
         this->written = NULL;
         this->reads_remaining = NULL;
         this->hw_reads_remaining = NULL;
      }
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void run(cfg_t *cfg);
   void add_insts_from_block(bblock_t *block);
   void compute_delays();
   void compute_exits();
   virtual void calculate_deps() = 0;
   virtual schedule_node *choose_instruction_to_schedule() = 0;

   /**
    * Returns how many cycles it takes the instruction to issue.
    *
    * Instructions in gen hardware are handled one simd4 vector at a time,
    * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
    */
   virtual int issue_time(backend_instruction *inst) = 0;

   virtual void count_reads_remaining(backend_instruction *inst) = 0;
   virtual void setup_liveness(cfg_t *cfg) = 0;
   virtual void update_register_pressure(backend_instruction *inst) = 0;
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;

   void schedule_instructions(bblock_t *block);

   void *mem_ctx;

   bool post_reg_alloc;
   int grf_count;
   unsigned hw_reg_count;
   int reg_pressure;
   int block_idx;
   exec_list instructions;
   const backend_shader *bs;

   instruction_scheduler_mode mode;

   /*
    * The register pressure at the beginning of each basic block.
    */

   int *reg_pressure_in;

   /*
    * The virtual GRF's whose range overlaps the beginning of each basic block.
    */

   BITSET_WORD **livein;

   /*
    * The virtual GRF's whose range overlaps the end of each basic block.
    */

   BITSET_WORD **liveout;

   /*
    * The hardware GRF's whose range overlaps the end of each basic block.
    */

   BITSET_WORD **hw_liveout;

   /*
    * Whether we've scheduled a write for this virtual GRF yet.
    */

   bool *written;

   /*
    * How many reads we haven't scheduled for this virtual GRF yet.
    */

   int *reads_remaining;

   /*
    * How many reads we haven't scheduled for this hardware GRF yet.
    */

   int *hw_reads_remaining;
};

class fs_instruction_scheduler : public instruction_scheduler
{
public:
   fs_instruction_scheduler(const fs_visitor *v, int grf_count, int hw_reg_count,
                            int block_count,
                            instruction_scheduler_mode mode);
   void calculate_deps();
   bool is_compressed(const fs_inst *inst);
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   const fs_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

fs_instruction_scheduler::fs_instruction_scheduler(const fs_visitor *v,
                                                   int grf_count, int hw_reg_count,
                                                   int block_count,
                                                   instruction_scheduler_mode mode)
   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
     v(v)
{
}

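/* Returns true if source @src reads the same value as an earlier source of
 * the same instruction, so repeated reads of one register aren't counted
 * twice in the register-pressure bookkeeping below.
 */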
static bool
is_src_duplicate(fs_inst *inst, int src)
{
   for (int i = 0; i < src; i++)
      if (inst->src[i].equals(inst->src[src]))
         return true;

   return false;
}

void
fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!reads_remaining)
      return;

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]++;
      } else if (inst->src[i].file == FIXED_GRF) {
         if (inst->src[i].nr >= hw_reg_count)
            continue;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            hw_reads_remaining[inst->src[i].nr + j]++;
      }
   }
}

void
fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
{
   const fs_live_variables &live = v->live_analysis.require();

   /* First, compute liveness on a per-GRF level using the in/out sets from
    * liveness calculation.
    */
   for (int block = 0; block < cfg->num_blocks; block++) {
      for (int i = 0; i < live.num_vars; i++) {
         if (BITSET_TEST(live.block_data[block].livein, i)) {
            int vgrf = live.vgrf_from_var[i];
            if (!BITSET_TEST(livein[block], vgrf)) {
               reg_pressure_in[block] += v->alloc.sizes[vgrf];
               BITSET_SET(livein[block], vgrf);
            }
         }

         if (BITSET_TEST(live.block_data[block].liveout, i))
            BITSET_SET(liveout[block], live.vgrf_from_var[i]);
      }
   }

   /* Now, extend the live in/live out sets for when a range crosses a block
    * boundary, which matches what our register allocator/interference code
    * does to account for force_writemask_all and incompatible exec_mask's.
    */
   for (int block = 0; block < cfg->num_blocks - 1; block++) {
      for (int i = 0; i < grf_count; i++) {
         if (live.vgrf_start[i] <= cfg->blocks[block]->end_ip &&
             live.vgrf_end[i] >= cfg->blocks[block + 1]->start_ip) {
            if (!BITSET_TEST(livein[block + 1], i)) {
               reg_pressure_in[block + 1] += v->alloc.sizes[i];
               BITSET_SET(livein[block + 1], i);
            }

            BITSET_SET(liveout[block], i);
         }
      }
   }

   int payload_last_use_ip[hw_reg_count];
   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);

   for (unsigned i = 0; i < hw_reg_count; i++) {
      if (payload_last_use_ip[i] == -1)
         continue;

      for (int block = 0; block < cfg->num_blocks; block++) {
         if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
            reg_pressure_in[block]++;

         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
            BITSET_SET(hw_liveout[block], i);
      }
   }
}

void
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!reads_remaining)
      return;

   if (inst->dst.file == VGRF) {
      written[inst->dst.nr] = true;
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]--;
      } else if (inst->src[i].file == FIXED_GRF &&
                 inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++)
            hw_reads_remaining[inst->src[i].nr + off]--;
      }
   }
}

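/* Estimate how much register pressure would drop if this instruction were
 * scheduled next: sources whose last remaining read this is become dead
 * (positive benefit), while a destination VGRF that isn't live yet becomes
 * live (negative benefit).
 */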
int
fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;
   int benefit = 0;

   if (inst->dst.file == VGRF) {
      if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
          !written[inst->dst.nr])
         benefit -= v->alloc.sizes[inst->dst.nr];
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF &&
          !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
          reads_remaining[inst->src[i].nr] == 1)
         benefit += v->alloc.sizes[inst->src[i].nr];

      if (inst->src[i].file == FIXED_GRF &&
          inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++) {
            int reg = inst->src[i].nr + off;
            if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
                hw_reads_remaining[reg] == 1) {
               benefit++;
            }
         }
      }
   }

   return benefit;
}

class vec4_instruction_scheduler : public instruction_scheduler
{
public:
   vec4_instruction_scheduler(const vec4_visitor *v, int grf_count);
   void calculate_deps();
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   const vec4_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

vec4_instruction_scheduler::vec4_instruction_scheduler(const vec4_visitor *v,
                                                       int grf_count)
   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
     v(v)
{
}

void
vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
{
}

void
vec4_instruction_scheduler::setup_liveness(cfg_t *)
{
}

void
vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
{
}

int
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
{
   return 0;
}

schedule_node::schedule_node(backend_instruction *inst,
                             instruction_scheduler *sched)
{
   const struct intel_device_info *devinfo = sched->bs->devinfo;

   this->devinfo = devinfo;
   this->inst = inst;
   this->child_array_size = 0;
   this->children = NULL;
   this->child_latency = NULL;
   this->child_count = 0;
   this->parent_count = 0;
   this->unblocked_time = 0;
   this->cand_generation = 0;
   this->delay = 0;
   this->exit = NULL;

   /* We can't measure Gfx6 timings directly but expect them to be much
    * closer to Gfx7 than Gfx4.
    */
   if (!sched->post_reg_alloc)
      this->latency = 1;
   else if (devinfo->ver >= 6)
      set_latency_gfx7(devinfo->is_haswell);
   else
      set_latency_gfx4();
}

void
instruction_scheduler::add_insts_from_block(bblock_t *block)
{
   foreach_inst_in_block(backend_instruction, inst, block) {
      schedule_node *n = new(mem_ctx) schedule_node(inst, this);

      instructions.push_tail(n);
   }
}

/** Computation of the delay member of each node. */
void
instruction_scheduler::compute_delays()
{
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      if (!n->child_count) {
         n->delay = issue_time(n->inst);
      } else {
         for (int i = 0; i < n->child_count; i++) {
            assert(n->children[i]->delay);
            n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
         }
      }
   }
}

void
instruction_scheduler::compute_exits()
{
   /* Calculate a lower bound of the scheduling time of each node in the
    * graph.  This is analogous to the node's critical path but calculated
    * from the top instead of from the bottom of the block.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      for (int i = 0; i < n->child_count; i++) {
         n->children[i]->unblocked_time =
            MAX2(n->children[i]->unblocked_time,
                 n->unblocked_time + issue_time(n->inst) + n->child_latency[i]);
      }
   }

   /* Calculate the exit of each node by induction based on the exit nodes of
    * its children.  The preferred exit of a node is the one among the exit
    * nodes of its children which can be unblocked first according to the
    * optimistic unblocked time estimate calculated above.
    */
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL);

      for (int i = 0; i < n->child_count; i++) {
         if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n))
            n->exit = n->children[i]->exit;
      }
   }
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

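/* Control flow, instructions with side effects and the HALT target act as
 * scheduling barriers; add_barrier_deps() serializes them against everything
 * between the previous and the next barrier in the block.
 */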
static bool
is_scheduling_barrier(const backend_instruction *inst)
{
   return inst->opcode == SHADER_OPCODE_HALT_TARGET ||
          inst->is_control_flow() ||
          inst->has_side_effects();
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it.  This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         if (is_scheduling_barrier(prev->inst))
            break;
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         if (is_scheduling_barrier(next->inst))
            break;
         next = (schedule_node *)next->next;
      }
   }
}

/* instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
fs_instruction_scheduler::is_compressed(const fs_inst *inst)
{
   return inst->exec_size == 16;
}

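/* Build the dependency DAG for the current block in two passes: a forward
 * walk over the instructions adds the RAW and WAW edges, and a backward walk
 * adds the WAR edges.
 */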
void
fs_instruction_scheduler::calculate_deps()
{
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
    * After register allocation, reg_offsets are gone and we track individual
    * GRF registers.
    */
   schedule_node **last_grf_write;
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
   schedule_node *last_conditional_mod[8] = {};
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   last_grf_write = (schedule_node **)calloc(sizeof(schedule_node *), grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(last_grf_write[inst->src[i].nr * 16 +
                                         inst->src[i].offset / REG_SIZE + r], n);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               add_dep(last_fixed_grf_write, n);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(last_conditional_mod[i], n);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr * 16 +
                                      inst->dst.offset / REG_SIZE + r], n);
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            add_dep(last_fixed_grf_write, n);
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i)) {
               add_dep(last_conditional_mod[i], n, 0);
               last_conditional_mod[i] = n;
            }
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(n, last_grf_write[inst->src[i].nr * 16 +
                                            inst->src[i].offset / REG_SIZE + r], 0);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               add_dep(n, last_fixed_grf_write, 0);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write, 0);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(n, last_conditional_mod[i]);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               last_conditional_mod[i] = n;
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }

   free(last_grf_write);
}

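/* The vec4 scheduler only runs post-register-allocation (its constructor
 * passes SCHEDULE_POST), so this is a simpler variant of the FS dependency
 * calculation above: whole GRFs are tracked and only one flag register.
 */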
void
vec4_instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
   schedule_node *last_conditional_mod = NULL;
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(last_grf_write[inst->src[i].nr + j], n);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].is_accumulator()) {
            assert(last_accumulator_write);
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (inst->reads_g0_implicitly())
         add_dep(last_fixed_grf_write, n);

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (inst->reads_flag()) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      if (inst->reads_accumulator_implicitly()) {
         assert(last_accumulator_write);
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j) {
            add_dep(last_grf_write[inst->dst.nr + j], n);
            last_grf_write[inst->dst.nr + j] = n;
         }
      } else if (inst->dst.file == MRF) {
         add_dep(last_mrf_write[inst->dst.nr], n);
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         add_dep(last_fixed_grf_write, n);
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag(v->devinfo)) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(n, last_grf_write[inst->src[i].nr + j]);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (inst->reads_flag()) {
         add_dep(n, last_conditional_mod);
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j)
            last_grf_write[inst->dst.nr + j] = n;
      } else if (inst->dst.file == MRF) {
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag(v->devinfo)) {
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }
}

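/* Pick the next DAG head to schedule.  In SCHEDULE_PRE and SCHEDULE_POST
 * modes this favors unblocking an early program exit and otherwise the
 * oldest ready node; in the pressure-sensitive pre-RA modes it prefers, in
 * order: register-pressure benefit, LIFO candidate order (SCHEDULE_PRE_LIFO
 * only), longest delay to the end of the program, and earliest exit.
 */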
schedule_node *
fs_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;

   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
      int chosen_time = 0;

      /* Of the instructions ready to execute or the closest to being ready,
       * choose the one most likely to unblock an early program exit, or
       * otherwise the oldest one.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         if (!chosen ||
             exit_unblocked_time(n) < exit_unblocked_time(chosen) ||
             (exit_unblocked_time(n) == exit_unblocked_time(chosen) &&
              n->unblocked_time < chosen_time)) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }
   } else {
      int chosen_register_pressure_benefit = 0;

      /* Before register allocation, we don't care about the latencies of
       * instructions.  All we care about is reducing live intervals of
       * variables so that we can avoid register spilling, or get SIMD16
       * shaders which naturally do a better job of hiding instruction
       * latency.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         fs_inst *inst = (fs_inst *)n->inst;

         if (!chosen) {
            chosen = n;
            chosen_register_pressure_benefit =
               get_register_pressure_benefit(chosen->inst);
            continue;
         }

         /* Most important: If we can definitely reduce register pressure, do
          * so immediately.
          */
         int register_pressure_benefit = get_register_pressure_benefit(n->inst);

         if (register_pressure_benefit > 0 &&
             register_pressure_benefit > chosen_register_pressure_benefit) {
            chosen = n;
            chosen_register_pressure_benefit = register_pressure_benefit;
            continue;
         } else if (chosen_register_pressure_benefit > 0 &&
                    (register_pressure_benefit <
                     chosen_register_pressure_benefit)) {
            continue;
         }

         if (mode == SCHEDULE_PRE_LIFO) {
            /* Prefer instructions that recently became available for
             * scheduling.  These are the things that are most likely to
             * (eventually) make a variable dead and reduce register pressure.
             * Typical register pressure estimates don't work for us because
             * most of our pressure comes from texturing, where no single
             * instruction to schedule will make a vec4 value dead.
             */
            if (n->cand_generation > chosen->cand_generation) {
               chosen = n;
               chosen_register_pressure_benefit = register_pressure_benefit;
               continue;
            } else if (n->cand_generation < chosen->cand_generation) {
               continue;
            }

            /* On MRF-using chips, prefer non-SEND instructions.  If we don't
             * do this, then because we prefer instructions that just became
             * candidates, we'll end up in a pattern of scheduling a SEND,
             * then the MRFs for the next SEND, then the next SEND, then the
             * MRFs, etc., without ever consuming the results of a send.
             */
            if (v->devinfo->ver < 7) {
               fs_inst *chosen_inst = (fs_inst *)chosen->inst;

               /* We use size_written > 4 * exec_size as our test for the kind
                * of send instruction to avoid -- only sends generate many
                * regs, and a single-result send is probably actually reducing
                * register pressure.
                */
               if (inst->size_written <= 4 * inst->exec_size &&
                   chosen_inst->size_written > 4 * chosen_inst->exec_size) {
                  chosen = n;
                  chosen_register_pressure_benefit = register_pressure_benefit;
                  continue;
               } else if (inst->size_written > chosen_inst->size_written) {
                  continue;
               }
            }
         }

         /* For instructions pushed on the cands list at the same time, prefer
          * the one with the highest delay to the end of the program.  This is
          * most likely to have its values able to be consumed first (such as
          * for a large tree of lowered ubo loads, which appear reversed in
          * the instruction stream with respect to when they can be consumed).
          */
         if (n->delay > chosen->delay) {
            chosen = n;
            chosen_register_pressure_benefit = register_pressure_benefit;
            continue;
         } else if (n->delay < chosen->delay) {
            continue;
         }

         /* Prefer the node most likely to unblock an early program exit.
          */
         if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) {
            chosen = n;
            chosen_register_pressure_benefit = register_pressure_benefit;
            continue;
         } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) {
            continue;
         }

         /* If all other metrics are equal, we prefer the first instruction in
          * the list (program execution).
          */
      }
   }

   return chosen;
}

schedule_node *
vec4_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;
   int chosen_time = 0;

   /* Of the instructions ready to execute or the closest to being ready,
    * choose the oldest one.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      if (!chosen || n->unblocked_time < chosen_time) {
         chosen = n;
         chosen_time = n->unblocked_time;
      }
   }

   return chosen;
}

int
fs_instruction_scheduler::issue_time(backend_instruction *inst0)
{
   const fs_inst *inst = static_cast<fs_inst *>(inst0);
   const unsigned overhead = v->grf_used && has_bank_conflict(v->devinfo, inst) ?
      DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE) : 0;
   if (is_compressed(inst))
      return 4 + overhead;
   else
      return 2 + overhead;
}

int
vec4_instruction_scheduler::issue_time(backend_instruction *)
{
   /* We always execute as two vec4s in parallel. */
   return 2;
}

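/* The core list-scheduling loop for one basic block: repeatedly pick a ready
 * DAG head, append it to the block, advance the clock by its issue time, and
 * move children whose last parent just retired onto the ready list with
 * updated unblocked times.
 */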
void
instruction_scheduler::schedule_instructions(bblock_t *block)
{
   const struct intel_device_info *devinfo = bs->devinfo;
   int time = 0;
   int instructions_to_schedule = block->end_ip - block->start_ip + 1;

   if (!post_reg_alloc)
      reg_pressure = reg_pressure_in[block->num];
   block_idx = block->num;

   /* Remove non-DAG heads from the list. */
   foreach_in_list_safe(schedule_node, n, &instructions) {
      if (n->parent_count != 0)
         n->remove();
   }

   unsigned cand_generation = 1;
   while (!instructions.is_empty()) {
      schedule_node *chosen = choose_instruction_to_schedule();

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      chosen->inst->exec_node::remove();
      block->instructions.push_tail(chosen->inst);
      instructions_to_schedule--;

      if (!post_reg_alloc) {
         reg_pressure -= get_register_pressure_benefit(chosen->inst);
         update_register_pressure(chosen->inst);
      }

      /* If we expected a delay for scheduling, then bump the clock to reflect
       * that.  In reality, the hardware will switch to another hyperthread
       * and may not return to dispatching our thread for a while even after
       * we're unblocked.  After this, we have the time when the chosen
       * instruction will start executing.
       */
      time = MAX2(time, chosen->unblocked_time);

      /* Update the clock for how soon an instruction could start after the
       * chosen one.
       */
      time += issue_time(chosen->inst);

      if (debug) {
         fprintf(stderr, "clock %4d, scheduled: ", time);
         bs->dump_instruction(chosen->inst);
         if (!post_reg_alloc)
            fprintf(stderr, "(register pressure %d)\n", reg_pressure);
      }

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled.  Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = chosen->child_count - 1; i >= 0; i--) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         if (debug) {
            fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
            bs->dump_instruction(child->inst);
         }

         child->cand_generation = cand_generation;
         child->parent_count--;
         if (child->parent_count == 0) {
            if (debug) {
               fprintf(stderr, "\t\tnow available\n");
            }
            instructions.push_head(child);
         }
      }
      cand_generation++;

      /* Shared resource: the mathbox.  There's one mathbox per EU on Gfx6+
       * but it's more limited pre-gfx6, so if we send something off to it then
       * the next math instruction isn't going to make progress until the first
       * is done.
       */
      if (devinfo->ver < 6 && chosen->inst->is_math()) {
         foreach_in_list(schedule_node, n, &instructions) {
            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   assert(instructions_to_schedule == 0);
}

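/* Schedule the whole shader: for each basic block, build the schedule_node
 * list, compute dependencies, delays and exit estimates, then emit the block
 * in the newly chosen order.
 */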
void
instruction_scheduler::run(cfg_t *cfg)
{
   if (debug && !post_reg_alloc) {
      fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bs->dump_instructions();
   }

   if (!post_reg_alloc)
      setup_liveness(cfg);

   foreach_block(block, cfg) {
      if (reads_remaining) {
         memset(reads_remaining, 0,
                grf_count * sizeof(*reads_remaining));
         memset(hw_reads_remaining, 0,
                hw_reg_count * sizeof(*hw_reads_remaining));
         memset(written, 0, grf_count * sizeof(*written));

         foreach_inst_in_block(fs_inst, inst, block)
            count_reads_remaining(inst);
      }

      add_insts_from_block(block);

      calculate_deps();

      compute_delays();
      compute_exits();

      schedule_instructions(block);
   }

   if (debug && !post_reg_alloc) {
      fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bs->dump_instructions();
   }
}

void
fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
{
   int grf_count;
   if (mode == SCHEDULE_POST)
      grf_count = grf_used;
   else
      grf_count = alloc.count;

   fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
                                  cfg->num_blocks, mode);
   sched.run(cfg);

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}

void
vec4_visitor::opt_schedule_instructions()
{
   vec4_instruction_scheduler sched(this, prog_data->total_grf);
   sched.run(cfg);

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}