1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2010 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg * 23b8e80941Smrg * Authors: 24b8e80941Smrg * Eric Anholt <eric@anholt.net> 25b8e80941Smrg * 26b8e80941Smrg */ 27b8e80941Smrg 28b8e80941Smrg#include "brw_fs.h" 29b8e80941Smrg#include "brw_fs_live_variables.h" 30b8e80941Smrg#include "brw_vec4.h" 31b8e80941Smrg#include "brw_cfg.h" 32b8e80941Smrg#include "brw_shader.h" 33b8e80941Smrg 34b8e80941Smrgusing namespace brw; 35b8e80941Smrg 36b8e80941Smrg/** @file brw_fs_schedule_instructions.cpp 37b8e80941Smrg * 38b8e80941Smrg * List scheduling of FS instructions. 
39b8e80941Smrg * 40b8e80941Smrg * The basic model of the list scheduler is to take a basic block, 41b8e80941Smrg * compute a DAG of the dependencies (RAW ordering with latency, WAW 42b8e80941Smrg * ordering with latency, WAR ordering), and make a list of the DAG heads. 43b8e80941Smrg * Heuristically pick a DAG head, then put all the children that are 44b8e80941Smrg * now DAG heads into the list of things to schedule. 45b8e80941Smrg * 46b8e80941Smrg * The heuristic is the important part. We're trying to be cheap, 47b8e80941Smrg * since actually computing the optimal scheduling is NP complete. 48b8e80941Smrg * What we do is track a "current clock". When we schedule a node, we 49b8e80941Smrg * update the earliest-unblocked clock time of its children, and 50b8e80941Smrg * increment the clock. Then, when trying to schedule, we just pick 51b8e80941Smrg * the earliest-unblocked instruction to schedule. 52b8e80941Smrg * 53b8e80941Smrg * Note that often there will be many things which could execute 54b8e80941Smrg * immediately, and there are a range of heuristic options to choose 55b8e80941Smrg * from in picking among those. 
56b8e80941Smrg */ 57b8e80941Smrg 58b8e80941Smrgstatic bool debug = false; 59b8e80941Smrg 60b8e80941Smrgclass instruction_scheduler; 61b8e80941Smrg 62b8e80941Smrgclass schedule_node : public exec_node 63b8e80941Smrg{ 64b8e80941Smrgpublic: 65b8e80941Smrg schedule_node(backend_instruction *inst, instruction_scheduler *sched); 66b8e80941Smrg void set_latency_gen4(); 67b8e80941Smrg void set_latency_gen7(bool is_haswell); 68b8e80941Smrg 69b8e80941Smrg backend_instruction *inst; 70b8e80941Smrg schedule_node **children; 71b8e80941Smrg int *child_latency; 72b8e80941Smrg int child_count; 73b8e80941Smrg int parent_count; 74b8e80941Smrg int child_array_size; 75b8e80941Smrg int unblocked_time; 76b8e80941Smrg int latency; 77b8e80941Smrg 78b8e80941Smrg /** 79b8e80941Smrg * Which iteration of pushing groups of children onto the candidates list 80b8e80941Smrg * this node was a part of. 81b8e80941Smrg */ 82b8e80941Smrg unsigned cand_generation; 83b8e80941Smrg 84b8e80941Smrg /** 85b8e80941Smrg * This is the sum of the instruction's latency plus the maximum delay of 86b8e80941Smrg * its children, or just the issue_time if it's a leaf node. 87b8e80941Smrg */ 88b8e80941Smrg int delay; 89b8e80941Smrg 90b8e80941Smrg /** 91b8e80941Smrg * Preferred exit node among the (direct or indirect) successors of this 92b8e80941Smrg * node. Among the scheduler nodes blocked by this node, this will be the 93b8e80941Smrg * one that may cause earliest program termination, or NULL if none of the 94b8e80941Smrg * successors is an exit node. 95b8e80941Smrg */ 96b8e80941Smrg schedule_node *exit; 97b8e80941Smrg}; 98b8e80941Smrg 99b8e80941Smrg/** 100b8e80941Smrg * Lower bound of the scheduling time after which one of the instructions 101b8e80941Smrg * blocked by this node may lead to program termination. 
 *
 * exit_unblocked_time() determines a strict partial ordering relation '«' on
 * the set of scheduler nodes as follows:
 *
 *   n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
 *
 * which can be used to heuristically order nodes according to how early they
 * can unblock an exit node and lead to program termination.
 */
static inline int
exit_unblocked_time(const schedule_node *n)
{
   /* A node with no exit successor can never hasten termination, so it
    * compares later than any node that has one.
    */
   return n->exit ? n->exit->unblocked_time : INT_MAX;
}

/**
 * Estimate this instruction's latency on Gen4 hardware.
 *
 * Math opcodes are modeled as <rounds> * <channels> * <per-round latency>;
 * the per-opcode round counts come from the comments below.  SIMD8 channel
 * count is assumed.
 */
void
schedule_node::set_latency_gen4()
{
   int chans = 8;
   int math_latency = 22;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
      this->latency = 1 * chans * math_latency;
      break;
   case SHADER_OPCODE_RSQ:
      this->latency = 2 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
      /* full precision log.  partial is 2. */
      this->latency = 3 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_EXP2:
      /* full precision.  partial is 3, same throughput. */
      this->latency = 4 * chans * math_latency;
      break;
   case SHADER_OPCODE_POW:
      this->latency = 8 * chans * math_latency;
      break;
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* minimum latency, max is 12 rounds.
       */
      this->latency = 5 * chans * math_latency;
      break;
   default:
      /* Non-math instructions: a nominal 2-cycle estimate. */
      this->latency = 2;
      break;
   }
}

/**
 * Estimate this instruction's latency on Gen7 hardware (Ivybridge/Haswell).
 *
 * The numbers are derived from the empirical shader measurements quoted in
 * the per-opcode comments below — rough estimates, not architectural
 * figures.
 */
void
schedule_node::set_latency_gen7(bool is_haswell)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MAD:
      /* 2 cycles
       * (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       * (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 18 cycles on IVB, 16 on HSW
       * (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,5,1>F                    { align16 WE_normal 1Q };
       *
       * 20 cycles on IVB, 18 on HSW
       * (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,4,1>F                    { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = is_haswell ? 16 : 18;
      break;

   case BRW_OPCODE_LRP:
      /* 2 cycles
       * (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       * (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 16 cycles on IVB, 14 on HSW
       * (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,4,1>F                    { align16 WE_normal 1Q };
       *
       * 16 cycles
       * (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,4,1>F                    { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = 14;
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* 2 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F      null    { align1 WE_normal 1Q };
       *
       * 18 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F      null    { align1 WE_normal 1Q };
       * mov(8)      null   g4<8,8,1>F              { align1 WE_normal 1Q };
       *
       * Same for exp2, log2, rsq, sqrt, sin, cos.
       */
      latency = is_haswell ? 14 : 16;
      break;

   case SHADER_OPCODE_POW:
      /* 2 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
       *
       * 26 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
       */
      latency = is_haswell ? 22 : 24;
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
      /* 18 cycles:
       * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
       * send(8) g4<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       *
       * 697 +/-49 cycles (min 610, n=26):
       * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
       * send(8) g4<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
       *
       * So the latency on our first texture load of the batchbuffer takes
       * ~700 cycles, since the caches are cold at that point.
       *
       * 840 +/- 92 cycles (min 720, n=25):
       * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
       * send(8) g4<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
       * send(8) g4<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
       *
       * On the second load, it takes just an extra ~140 cycles, and after
       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
       *
       * 683 +/- 49 cycles (min = 602, n=47):
       * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
       * send(8) g4<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * send(8) g50<1>UW   g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
       *
       * The unit appears to be pipelined, since this matches up with the
       * cache-cold case, despite there being two loads here.  If you replace
       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
       *
       * So, take some number between the cache-hot 140 cycles and the
       * cache-cold 700 cycles.  No particular tuning was done on this.
       *
       * I haven't done significant testing of the non-TEX opcodes.  TXL at
       * least looked about the same as TEX.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_TXS:
      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
       * cycles (n=15):
       * mov(8)   g114<1>UD  0D                  { align1 WE_normal 1Q };
       * send(8)  g6<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 10, 1) mlen 1 rlen 4  { align1 WE_normal 1Q };
       * mov(16)  g6<1>F     g6<8,8,1>D          { align1 WE_normal 1Q };
       *
       *
       * Two loads was 535 +/- 30 cycles (n=19):
       * mov(16)  g114<1>UD  0D                  { align1 WE_normal 1H };
       * send(16) g6<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8  { align1 WE_normal 1H };
       * mov(16)  g114<1>UD  0D                  { align1 WE_normal 1H };
       * mov(16)  g6<1>F     g6<8,8,1>D          { align1 WE_normal 1H };
       * send(16) g8<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8  { align1 WE_normal 1H };
       * mov(16)  g8<1>F     g8<8,8,1>D          { align1 WE_normal 1H };
       * add(16)  g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
       *
       * Since the only caches that should matter are just the
       * instruction/state cache containing the surface state, assume that we
       * always have hot caches.
       */
      latency = 100;
      break;

   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      /* testing using varying-index pull constants:
       *
       * 16 cycles:
       * mov(8)  g4<1>D  g2.1<0,1,0>F        { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1      { align1 WE_normal 1Q };
       *
       * ~480 cycles:
       * mov(8)  g4<1>D  g2.1<0,1,0>F        { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1      { align1 WE_normal 1Q };
       * mov(8)  null    g4<8,8,1>F          { align1 WE_normal 1Q };
       *
       * ~620 cycles:
       * mov(8)  g4<1>D  g2.1<0,1,0>F        { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1      { align1 WE_normal 1Q };
       * mov(8)  null    g4<8,8,1>F          { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1      { align1 WE_normal 1Q };
       * mov(8)  null    g4<8,8,1>F          { align1 WE_normal 1Q };
       *
       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
       * about 460.  We expect to mostly be cache hot, so pick something more
       * in that direction.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_GEN7_SCRATCH_READ:
      /* Testing a load from offset 0, that had been previously written:
       *
       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
       *
       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
       * then around 140.  Presumably this is cache hit vs miss.
       */
      latency = 50;
      break;

   case VEC4_OPCODE_UNTYPED_ATOMIC:
      /* See GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
      latency = 14000;
      break;

   case VEC4_OPCODE_UNTYPED_SURFACE_READ:
   case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
      /* See also GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ */
      latency = is_haswell ? 300 : 600;
      break;

   case SHADER_OPCODE_SEND:
      /* Raw SENDs: decode the target shared function (SFID) and message
       * type from the descriptor and reuse the estimates above.
       */
      switch (inst->sfid) {
      case BRW_SFID_SAMPLER: {
         unsigned msg_type = (inst->desc >> 12) & 0x1f;
         switch (msg_type) {
         case GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO:
         case GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO:
            /* See also SHADER_OPCODE_TXS */
            latency = 100;
            break;

         default:
            /* See also SHADER_OPCODE_TEX */
            latency = 200;
            break;
         }
         break;
      }

      case GEN6_SFID_DATAPORT_RENDER_CACHE:
         switch ((inst->desc >> 14) & 0x1f) {
         case GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE:
         case GEN7_DATAPORT_RC_TYPED_SURFACE_READ:
            /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
            assert(!is_haswell);
            latency = 600;
            break;

         case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
            /* See also SHADER_OPCODE_TYPED_ATOMIC */
            assert(!is_haswell);
            latency = 14000;
            break;

         default:
            unreachable("Unknown render cache message");
         }
         break;

      case GEN7_SFID_DATAPORT_DATA_CACHE:
         switch ((inst->desc >> 14) & 0x1f) {
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
         case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
            /* We have no data for this but assume it's roughly the same as
             * untyped surface read/write.
             */
            latency = 300;
            break;

         case GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ:
         case GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
            /* Test code:
             * mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
             * mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
             * mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
             * send(8)   g4<1>UD         g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1              { align1 WE_normal 1Q };
             *   .
             *   . [repeats 8 times]
             *   .
             * mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
             * mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
             * mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
             * send(8)   g4<1>UD         g112<8,8,1>UD
             *   data (38, 6, 5) mlen 2 rlen 1              { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 583 cycles per surface read,
             * standard deviation 0.9%.
             */
            assert(!is_haswell);
            latency = 600;
            break;

         case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
            /* Test code:
             * mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
             * mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
             * mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
             * send(8)   g4<1>ud         g112<8,8,1>ud
             *   data (38, 5, 6) mlen 2 rlen 1              { align1 WE_normal 1Q };
             *
             * Running it 100 times as fragment shader on a 128x128 quad
             * gives an average latency of 13867 cycles per atomic op,
             * standard deviation 3%.  Note that this is a rather
             * pessimistic estimate, the actual latency in cases with few
             * collisions between threads and favorable pipelining has been
             * seen to be reduced by a factor of 100.
             */
            assert(!is_haswell);
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      case HSW_SFID_DATAPORT_DATA_CACHE_1:
         switch ((inst->desc >> 14) & 0x1f) {
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
         case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
         case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
         case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
         case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
         case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
            /* See also GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ */
            latency = 300;
            break;

         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
         case GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
         case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP:
         case GEN9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP:
            /* See also GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
            latency = 14000;
            break;

         default:
            unreachable("Unknown data cache message");
         }
         break;

      default:
         unreachable("Unknown SFID");
      }
      break;

   default:
      /* 2 cycles:
       * mul(8) g4<1>F g2<0,1,0>F      0.5F    { align1 WE_normal 1Q };
       *
       * 16 cycles:
       * mul(8) g4<1>F g2<0,1,0>F      0.5F    { align1 WE_normal 1Q };
       * mov(8) null   g4<8,8,1>F              { align1 WE_normal 1Q };
       */
      latency = 14;
      break;
   }
}

/**
 * Common machinery for list scheduling.  Subclasses (e.g.
 * fs_instruction_scheduler below) supply the backend-specific pieces:
 * dependency calculation and the heuristic choice of the next instruction.
 */
class instruction_scheduler {
public:
   instruction_scheduler(backend_shader *s, int grf_count,
                         unsigned hw_reg_count, int block_count,
                         instruction_scheduler_mode mode)
   {
      this->bs = s;
      /* All tracking arrays below are allocated on this context and freed
       * together in the destructor.
       */
      this->mem_ctx = ralloc_context(NULL);
      this->grf_count = grf_count;
      this->hw_reg_count = hw_reg_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
      this->post_reg_alloc = (mode == SCHEDULE_POST);
      this->mode = mode;
      if (!post_reg_alloc) {
         /* Pre-register-allocation scheduling tracks register pressure, so
          * allocate the per-block liveness sets and read/write counters.
          */
         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);

         this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                            BITSET_WORDS(grf_count));

         this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                             BITSET_WORDS(grf_count));

         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                BITSET_WORDS(hw_reg_count));

         this->written = rzalloc_array(mem_ctx, bool, grf_count);

         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);

         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
      } else {
         /* Post-register-allocation scheduling doesn't track pressure, so
          * none of the tracking arrays are needed.
          */
         this->reg_pressure_in = NULL;
         this->livein = NULL;
         this->liveout = NULL;
         this->hw_liveout = NULL;
         this->written = NULL;
         this->reads_remaining = NULL;
         this->hw_reads_remaining = NULL;
      }
   }

   ~instruction_scheduler()
   {
      /* Frees every array allocated in the constructor in one go. */
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void run(cfg_t *cfg);
   void add_insts_from_block(bblock_t *block);
   void compute_delays();
   void compute_exits();

   /* Backend-specific hooks implemented by subclasses. */
   virtual void calculate_deps() = 0;
   virtual schedule_node *choose_instruction_to_schedule() = 0;

   /**
    * Returns how many cycles it takes the instruction to issue.
    *
    * Instructions in gen hardware are handled one simd4 vector at a time,
    * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
    */
   virtual int issue_time(backend_instruction *inst) = 0;

   virtual void count_reads_remaining(backend_instruction *inst) = 0;
   virtual void setup_liveness(cfg_t *cfg) = 0;
   virtual void update_register_pressure(backend_instruction *inst) = 0;
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;

   void schedule_instructions(bblock_t *block);

   /* ralloc context owning all of the arrays below. */
   void *mem_ctx;

   bool post_reg_alloc;
   int instructions_to_schedule;
   int grf_count;
   unsigned hw_reg_count;
   int reg_pressure;
   int block_idx;
   exec_list instructions;
   backend_shader *bs;

   instruction_scheduler_mode mode;

   /*
    * The register pressure at the beginning of each basic block.
    */

   int *reg_pressure_in;

   /*
    * The virtual GRF's whose range overlaps the beginning of each basic block.
    */

   BITSET_WORD **livein;

   /*
    * The virtual GRF's whose range overlaps the end of each basic block.
    */

   BITSET_WORD **liveout;

   /*
    * The hardware GRF's whose range overlaps the end of each basic block.
    */

   BITSET_WORD **hw_liveout;

   /*
    * Whether we've scheduled a write for this virtual GRF yet.
    */

   bool *written;

   /*
    * How many reads we haven't scheduled for this virtual GRF yet.
    */

   int *reads_remaining;

   /*
    * How many reads we haven't scheduled for this hardware GRF yet.
    */

   int *hw_reads_remaining;
};

/**
 * FS-backend specialization of the scheduler; implements the virtual hooks
 * declared on instruction_scheduler above.
 */
class fs_instruction_scheduler : public instruction_scheduler
{
public:
   fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
                            int block_count,
                            instruction_scheduler_mode mode);
   void calculate_deps();
   bool is_compressed(fs_inst *inst);
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   fs_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
                                                   int grf_count, int hw_reg_count,
                                                   int block_count,
                                                   instruction_scheduler_mode mode)
   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
     v(v)
{
}

/**
 * Returns true if src[src] reads the same value as an earlier source of
 * inst, so a duplicate read should not be counted twice.
 */
static bool
is_src_duplicate(fs_inst *inst, int src)
{
   for (int i = 0; i < src; i++)
      if (inst->src[i].equals(inst->src[src]))
         return true;

   return false;
}

/**
 * Accumulate into reads_remaining/hw_reads_remaining the number of reads
 * this instruction performs of each virtual and fixed GRF.
 */
void
fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
{
   fs_inst *inst =
(fs_inst *)be;

   /* reads_remaining is only allocated for pre-register-allocation
    * scheduling; nothing to count otherwise.
    */
   if (!reads_remaining)
      return;

   for (int i = 0; i < inst->sources; i++) {
      /* A source equal to an earlier one doesn't add an extra read. */
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]++;
      } else if (inst->src[i].file == FIXED_GRF) {
         /* Skip hardware registers beyond the tracked range. */
         if (inst->src[i].nr >= hw_reg_count)
            continue;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            hw_reads_remaining[inst->src[i].nr + j]++;
      }
   }
}

/**
 * Populate the per-block livein/liveout/hw_liveout sets and the
 * reg_pressure_in estimates from the visitor's liveness analysis and the
 * payload register ranges.
 */
void
fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
{
   /* First, compute liveness on a per-GRF level using the in/out sets from
    * liveness calculation.
    */
   for (int block = 0; block < cfg->num_blocks; block++) {
      for (int i = 0; i < v->live_intervals->num_vars; i++) {
         if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
            int vgrf = v->live_intervals->vgrf_from_var[i];
            if (!BITSET_TEST(livein[block], vgrf)) {
               reg_pressure_in[block] += v->alloc.sizes[vgrf];
               BITSET_SET(livein[block], vgrf);
            }
         }

         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
            BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
      }
   }

   /* Now, extend the live in/live out sets for when a range crosses a block
    * boundary, which matches what our register allocator/interference code
    * does to account for force_writemask_all and incompatible exec_mask's.
    */
   for (int block = 0; block < cfg->num_blocks - 1; block++) {
      for (int i = 0; i < grf_count; i++) {
         if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
             v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
            if (!BITSET_TEST(livein[block + 1], i)) {
               reg_pressure_in[block + 1] += v->alloc.sizes[i];
               BITSET_SET(livein[block + 1], i);
            }

            BITSET_SET(liveout[block], i);
         }
      }
   }

   /* NOTE(review): variable-length array — a GNU extension in C++; relies
    * on compiler support.
    */
   int payload_last_use_ip[hw_reg_count];
   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);

   /* Payload registers count toward pressure in every block up to their
    * last use (-1 means the register is never used).
    */
   for (unsigned i = 0; i < hw_reg_count; i++) {
      if (payload_last_use_ip[i] == -1)
         continue;

      for (int block = 0; block < cfg->num_blocks; block++) {
         if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
            reg_pressure_in[block]++;

         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
            BITSET_SET(hw_liveout[block], i);
      }
   }
}

/**
 * Bookkeeping after an instruction is scheduled: mark its destination VGRF
 * as written and decrement the outstanding read counts for its sources.
 */
void
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   /* Tracking arrays only exist for pre-register-allocation scheduling. */
   if (!reads_remaining)
      return;

   if (inst->dst.file == VGRF) {
      written[inst->dst.nr] = true;
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]--;
      } else if
(inst->src[i].file == FIXED_GRF && 798b8e80941Smrg inst->src[i].nr < hw_reg_count) { 799b8e80941Smrg for (unsigned off = 0; off < regs_read(inst, i); off++) 800b8e80941Smrg hw_reads_remaining[inst->src[i].nr + off]--; 801b8e80941Smrg } 802b8e80941Smrg } 803b8e80941Smrg} 804b8e80941Smrg 805b8e80941Smrgint 806b8e80941Smrgfs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) 807b8e80941Smrg{ 808b8e80941Smrg fs_inst *inst = (fs_inst *)be; 809b8e80941Smrg int benefit = 0; 810b8e80941Smrg 811b8e80941Smrg if (inst->dst.file == VGRF) { 812b8e80941Smrg if (!BITSET_TEST(livein[block_idx], inst->dst.nr) && 813b8e80941Smrg !written[inst->dst.nr]) 814b8e80941Smrg benefit -= v->alloc.sizes[inst->dst.nr]; 815b8e80941Smrg } 816b8e80941Smrg 817b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 818b8e80941Smrg if (is_src_duplicate(inst, i)) 819b8e80941Smrg continue; 820b8e80941Smrg 821b8e80941Smrg if (inst->src[i].file == VGRF && 822b8e80941Smrg !BITSET_TEST(liveout[block_idx], inst->src[i].nr) && 823b8e80941Smrg reads_remaining[inst->src[i].nr] == 1) 824b8e80941Smrg benefit += v->alloc.sizes[inst->src[i].nr]; 825b8e80941Smrg 826b8e80941Smrg if (inst->src[i].file == FIXED_GRF && 827b8e80941Smrg inst->src[i].nr < hw_reg_count) { 828b8e80941Smrg for (unsigned off = 0; off < regs_read(inst, i); off++) { 829b8e80941Smrg int reg = inst->src[i].nr + off; 830b8e80941Smrg if (!BITSET_TEST(hw_liveout[block_idx], reg) && 831b8e80941Smrg hw_reads_remaining[reg] == 1) { 832b8e80941Smrg benefit++; 833b8e80941Smrg } 834b8e80941Smrg } 835b8e80941Smrg } 836b8e80941Smrg } 837b8e80941Smrg 838b8e80941Smrg return benefit; 839b8e80941Smrg} 840b8e80941Smrg 841b8e80941Smrgclass vec4_instruction_scheduler : public instruction_scheduler 842b8e80941Smrg{ 843b8e80941Smrgpublic: 844b8e80941Smrg vec4_instruction_scheduler(vec4_visitor *v, int grf_count); 845b8e80941Smrg void calculate_deps(); 846b8e80941Smrg schedule_node *choose_instruction_to_schedule(); 847b8e80941Smrg int 
issue_time(backend_instruction *inst);
   vec4_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
                                                       int grf_count)
   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
     v(v)
{
}

/* The vec4 scheduler is constructed with SCHEDULE_POST (see above), so it
 * never runs the pre-RA pressure heuristic; the pressure-tracking hooks are
 * therefore deliberate no-ops.
 */
void
vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
{
}

void
vec4_instruction_scheduler::setup_liveness(cfg_t *)
{
}

void
vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
{
}

int
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
{
   return 0;
}

schedule_node::schedule_node(backend_instruction *inst,
                             instruction_scheduler *sched)
{
   const struct gen_device_info *devinfo = sched->bs->devinfo;

   this->inst = inst;
   this->child_array_size = 0;
   this->children = NULL;
   this->child_latency = NULL;
   this->child_count = 0;
   this->parent_count = 0;
   this->unblocked_time = 0;
   this->cand_generation = 0;
   this->delay = 0;
   this->exit = NULL;

   /* We can't measure Gen6 timings directly but expect them to be much
    * closer to Gen7 than Gen4.
    */
   if (!sched->post_reg_alloc)
      this->latency = 1;   /* pre-RA scheduling deliberately ignores latency */
   else if (devinfo->gen >= 6)
      set_latency_gen7(devinfo->is_haswell);
   else
      set_latency_gen4();
}

/* Wrap every instruction of @block in a schedule_node and remember how many
 * instructions this block contributes to the scheduling problem.
 */
void
instruction_scheduler::add_insts_from_block(bblock_t *block)
{
   foreach_inst_in_block(backend_instruction, inst, block) {
      schedule_node *n = new(mem_ctx) schedule_node(inst, this);

      instructions.push_tail(n);
   }

   this->instructions_to_schedule = block->end_ip - block->start_ip + 1;
}

/** Computation of the delay member of each node.
 *
 * Walks the list bottom-up so every child's delay (critical-path length to
 * the end of the block) is known before its parents are visited.
 */
void
instruction_scheduler::compute_delays()
{
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      if (!n->child_count) {
         n->delay = issue_time(n->inst);
      } else {
         for (int i = 0; i < n->child_count; i++) {
            assert(n->children[i]->delay);
            n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
         }
      }
   }
}

void
instruction_scheduler::compute_exits()
{
   /* Calculate a lower bound of the scheduling time of each node in the
    * graph.  This is analogous to the node's critical path but calculated
    * from the top instead of from the bottom of the block.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      for (int i = 0; i < n->child_count; i++) {
         n->children[i]->unblocked_time =
            MAX2(n->children[i]->unblocked_time,
                 n->unblocked_time + issue_time(n->inst) + n->child_latency[i]);
      }
   }

   /* Calculate the exit of each node by induction based on the exit nodes of
    * its children.  The preferred exit of a node is the one among the exit
    * nodes of its children which can be unblocked first according to the
    * optimistic unblocked time estimate calculated above.
    *
    * A DISCARD_JUMP is itself an exit; everything else inherits one from a
    * child.
    */
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      n->exit = (n->inst->opcode == FS_OPCODE_DISCARD_JUMP ? n : NULL);

      for (int i = 0; i < n->child_count; i++) {
         if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n))
            n->exit = n->children[i]->exit;
      }
   }
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   /* If the edge already exists, just keep the worst-case latency. */
   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   /* Grow the child arrays: start at 16 entries, then double. */
   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

/* Convenience overload: dependency with @before's own latency. */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

/* An instruction that must not be reordered across anything else:
 * placeholder halts, control flow, and anything with side effects.
 */
static bool
is_scheduling_barrier(const backend_instruction *inst)
{
   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
          inst->is_control_flow() ||
          inst->has_side_effects();
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it.  This adds
 * the deps to do so.
 *
 * The walks stop at the previous/next barrier, since that barrier already
 * ordered everything beyond it.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         if (is_scheduling_barrier(prev->inst))
            break;
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         if (is_scheduling_barrier(next->inst))
            break;
         next = (schedule_node *)next->next;
      }
   }
}

/* instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
fs_instruction_scheduler::is_compressed(fs_inst *inst)
{
   return inst->exec_size == 16;
}

void
fs_instruction_scheduler::calculate_deps()
{
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
    * After register allocation, reg_offsets are gone and we track individual
    * GRF registers.
    */
   schedule_node **last_grf_write;
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
   schedule_node *last_conditional_mod[8] = {};
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* NOTE(review): calloc's conventional argument order is (nmemb, size);
    * the swapped order here allocates the same number of bytes, so it is
    * harmless.  The 16 slots per VGRF assume a virtual GRF never spans more
    * than 16 registers -- TODO confirm against the allocator.
    */
   last_grf_write = (schedule_node **)calloc(sizeof(schedule_node *), grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(last_grf_write[inst->src[i].nr * 16 +
                                         inst->src[i].offset / REG_SIZE + r], n);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               add_dep(last_fixed_grf_write, n);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF) {
            /* Unknown architecture register: be conservative. */
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(last_conditional_mod[i], n);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr * 16 +
                                      inst->dst.offset / REG_SIZE + r], n);
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         /* A compressed write touches a second MRF (+4 for COMPR4 layout,
          * +1 otherwise).
          */
         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written()) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i)) {
               add_dep(last_conditional_mod[i], n, 0);
               last_conditional_mod[i] = n;
            }
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   /* Iterating bottom-up: the last_*_write trackers hold the most recently
    * seen write *below* the current instruction, so each read adds a
    * zero-latency WAR edge to that later write.
    */
   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(n, last_grf_write[inst->src[i].nr * 16 +
                                            inst->src[i].offset / REG_SIZE + r], 0);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               add_dep(n, last_fixed_grf_write, 0);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write, 0);
         } else if (inst->src[i].file == ARF) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(n, last_conditional_mod[i]);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written()) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               last_conditional_mod[i] = n;
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }

   free(last_grf_write);
}

/* vec4 counterpart of the FS dependency pass above.  This scheduler only
 * runs post-RA (its constructor passes SCHEDULE_POST), so writes are tracked
 * per hardware GRF with no sub-register offsets.  last_grf_write is a VLA
 * (GNU extension in C++).
 */
void
vec4_instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
   schedule_node *last_conditional_mod = NULL;
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps, over the three vec4 source slots. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(last_grf_write[inst->src[i].nr + j], n);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].is_accumulator()) {
            assert(last_accumulator_write);
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF) {
            add_barrier_deps(n);
         }
      }

      if (inst->reads_g0_implicitly())
         add_dep(last_fixed_grf_write, n);

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (inst->reads_flag()) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      if (inst->reads_accumulator_implicitly()) {
         assert(last_accumulator_write);
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j) {
            add_dep(last_grf_write[inst->dst.nr + j], n);
            last_grf_write[inst->dst.nr + j] = n;
         }
      } else if (inst->dst.file == MRF) {
         add_dep(last_mrf_write[inst->dst.nr], n);
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(n, last_grf_write[inst->src[i].nr + j]);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write);
         } else if (inst->src[i].file == ARF) {
            add_barrier_deps(n);
         }
      }

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (inst->reads_flag()) {
         add_dep(n, last_conditional_mod);
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j)
            last_grf_write[inst->dst.nr + j] = n;
      } else if (inst->dst.file == MRF) {
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }
}

/* Pick the next DAG head to schedule.  (Body continues below this chunk.) */
schedule_node *
fs_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;

   if (mode == SCHEDULE_PRE
|| mode == SCHEDULE_POST) { 1510b8e80941Smrg int chosen_time = 0; 1511b8e80941Smrg 1512b8e80941Smrg /* Of the instructions ready to execute or the closest to being ready, 1513b8e80941Smrg * choose the one most likely to unblock an early program exit, or 1514b8e80941Smrg * otherwise the oldest one. 1515b8e80941Smrg */ 1516b8e80941Smrg foreach_in_list(schedule_node, n, &instructions) { 1517b8e80941Smrg if (!chosen || 1518b8e80941Smrg exit_unblocked_time(n) < exit_unblocked_time(chosen) || 1519b8e80941Smrg (exit_unblocked_time(n) == exit_unblocked_time(chosen) && 1520b8e80941Smrg n->unblocked_time < chosen_time)) { 1521b8e80941Smrg chosen = n; 1522b8e80941Smrg chosen_time = n->unblocked_time; 1523b8e80941Smrg } 1524b8e80941Smrg } 1525b8e80941Smrg } else { 1526b8e80941Smrg /* Before register allocation, we don't care about the latencies of 1527b8e80941Smrg * instructions. All we care about is reducing live intervals of 1528b8e80941Smrg * variables so that we can avoid register spilling, or get SIMD16 1529b8e80941Smrg * shaders which naturally do a better job of hiding instruction 1530b8e80941Smrg * latency. 1531b8e80941Smrg */ 1532b8e80941Smrg foreach_in_list(schedule_node, n, &instructions) { 1533b8e80941Smrg fs_inst *inst = (fs_inst *)n->inst; 1534b8e80941Smrg 1535b8e80941Smrg if (!chosen) { 1536b8e80941Smrg chosen = n; 1537b8e80941Smrg continue; 1538b8e80941Smrg } 1539b8e80941Smrg 1540b8e80941Smrg /* Most important: If we can definitely reduce register pressure, do 1541b8e80941Smrg * so immediately. 
1542b8e80941Smrg */ 1543b8e80941Smrg int register_pressure_benefit = get_register_pressure_benefit(n->inst); 1544b8e80941Smrg int chosen_register_pressure_benefit = 1545b8e80941Smrg get_register_pressure_benefit(chosen->inst); 1546b8e80941Smrg 1547b8e80941Smrg if (register_pressure_benefit > 0 && 1548b8e80941Smrg register_pressure_benefit > chosen_register_pressure_benefit) { 1549b8e80941Smrg chosen = n; 1550b8e80941Smrg continue; 1551b8e80941Smrg } else if (chosen_register_pressure_benefit > 0 && 1552b8e80941Smrg (register_pressure_benefit < 1553b8e80941Smrg chosen_register_pressure_benefit)) { 1554b8e80941Smrg continue; 1555b8e80941Smrg } 1556b8e80941Smrg 1557b8e80941Smrg if (mode == SCHEDULE_PRE_LIFO) { 1558b8e80941Smrg /* Prefer instructions that recently became available for 1559b8e80941Smrg * scheduling. These are the things that are most likely to 1560b8e80941Smrg * (eventually) make a variable dead and reduce register pressure. 1561b8e80941Smrg * Typical register pressure estimates don't work for us because 1562b8e80941Smrg * most of our pressure comes from texturing, where no single 1563b8e80941Smrg * instruction to schedule will make a vec4 value dead. 1564b8e80941Smrg */ 1565b8e80941Smrg if (n->cand_generation > chosen->cand_generation) { 1566b8e80941Smrg chosen = n; 1567b8e80941Smrg continue; 1568b8e80941Smrg } else if (n->cand_generation < chosen->cand_generation) { 1569b8e80941Smrg continue; 1570b8e80941Smrg } 1571b8e80941Smrg 1572b8e80941Smrg /* On MRF-using chips, prefer non-SEND instructions. If we don't 1573b8e80941Smrg * do this, then because we prefer instructions that just became 1574b8e80941Smrg * candidates, we'll end up in a pattern of scheduling a SEND, 1575b8e80941Smrg * then the MRFs for the next SEND, then the next SEND, then the 1576b8e80941Smrg * MRFs, etc., without ever consuming the results of a send. 
1577b8e80941Smrg */ 1578b8e80941Smrg if (v->devinfo->gen < 7) { 1579b8e80941Smrg fs_inst *chosen_inst = (fs_inst *)chosen->inst; 1580b8e80941Smrg 1581b8e80941Smrg /* We use size_written > 4 * exec_size as our test for the kind 1582b8e80941Smrg * of send instruction to avoid -- only sends generate many 1583b8e80941Smrg * regs, and a single-result send is probably actually reducing 1584b8e80941Smrg * register pressure. 1585b8e80941Smrg */ 1586b8e80941Smrg if (inst->size_written <= 4 * inst->exec_size && 1587b8e80941Smrg chosen_inst->size_written > 4 * chosen_inst->exec_size) { 1588b8e80941Smrg chosen = n; 1589b8e80941Smrg continue; 1590b8e80941Smrg } else if (inst->size_written > chosen_inst->size_written) { 1591b8e80941Smrg continue; 1592b8e80941Smrg } 1593b8e80941Smrg } 1594b8e80941Smrg } 1595b8e80941Smrg 1596b8e80941Smrg /* For instructions pushed on the cands list at the same time, prefer 1597b8e80941Smrg * the one with the highest delay to the end of the program. This is 1598b8e80941Smrg * most likely to have its values able to be consumed first (such as 1599b8e80941Smrg * for a large tree of lowered ubo loads, which appear reversed in 1600b8e80941Smrg * the instruction stream with respect to when they can be consumed). 1601b8e80941Smrg */ 1602b8e80941Smrg if (n->delay > chosen->delay) { 1603b8e80941Smrg chosen = n; 1604b8e80941Smrg continue; 1605b8e80941Smrg } else if (n->delay < chosen->delay) { 1606b8e80941Smrg continue; 1607b8e80941Smrg } 1608b8e80941Smrg 1609b8e80941Smrg /* Prefer the node most likely to unblock an early program exit. 
1610b8e80941Smrg */ 1611b8e80941Smrg if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) { 1612b8e80941Smrg chosen = n; 1613b8e80941Smrg continue; 1614b8e80941Smrg } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) { 1615b8e80941Smrg continue; 1616b8e80941Smrg } 1617b8e80941Smrg 1618b8e80941Smrg /* If all other metrics are equal, we prefer the first instruction in 1619b8e80941Smrg * the list (program execution). 1620b8e80941Smrg */ 1621b8e80941Smrg } 1622b8e80941Smrg } 1623b8e80941Smrg 1624b8e80941Smrg return chosen; 1625b8e80941Smrg} 1626b8e80941Smrg 1627b8e80941Smrgschedule_node * 1628b8e80941Smrgvec4_instruction_scheduler::choose_instruction_to_schedule() 1629b8e80941Smrg{ 1630b8e80941Smrg schedule_node *chosen = NULL; 1631b8e80941Smrg int chosen_time = 0; 1632b8e80941Smrg 1633b8e80941Smrg /* Of the instructions ready to execute or the closest to being ready, 1634b8e80941Smrg * choose the oldest one. 1635b8e80941Smrg */ 1636b8e80941Smrg foreach_in_list(schedule_node, n, &instructions) { 1637b8e80941Smrg if (!chosen || n->unblocked_time < chosen_time) { 1638b8e80941Smrg chosen = n; 1639b8e80941Smrg chosen_time = n->unblocked_time; 1640b8e80941Smrg } 1641b8e80941Smrg } 1642b8e80941Smrg 1643b8e80941Smrg return chosen; 1644b8e80941Smrg} 1645b8e80941Smrg 1646b8e80941Smrgint 1647b8e80941Smrgfs_instruction_scheduler::issue_time(backend_instruction *inst) 1648b8e80941Smrg{ 1649b8e80941Smrg const unsigned overhead = v->bank_conflict_cycles((fs_inst *)inst); 1650b8e80941Smrg if (is_compressed((fs_inst *)inst)) 1651b8e80941Smrg return 4 + overhead; 1652b8e80941Smrg else 1653b8e80941Smrg return 2 + overhead; 1654b8e80941Smrg} 1655b8e80941Smrg 1656b8e80941Smrgint 1657b8e80941Smrgvec4_instruction_scheduler::issue_time(backend_instruction *) 1658b8e80941Smrg{ 1659b8e80941Smrg /* We always execute as two vec4s in parallel. 
*/ 1660b8e80941Smrg return 2; 1661b8e80941Smrg} 1662b8e80941Smrg 1663b8e80941Smrgvoid 1664b8e80941Smrginstruction_scheduler::schedule_instructions(bblock_t *block) 1665b8e80941Smrg{ 1666b8e80941Smrg const struct gen_device_info *devinfo = bs->devinfo; 1667b8e80941Smrg int time = 0; 1668b8e80941Smrg if (!post_reg_alloc) 1669b8e80941Smrg reg_pressure = reg_pressure_in[block->num]; 1670b8e80941Smrg block_idx = block->num; 1671b8e80941Smrg 1672b8e80941Smrg /* Remove non-DAG heads from the list. */ 1673b8e80941Smrg foreach_in_list_safe(schedule_node, n, &instructions) { 1674b8e80941Smrg if (n->parent_count != 0) 1675b8e80941Smrg n->remove(); 1676b8e80941Smrg } 1677b8e80941Smrg 1678b8e80941Smrg unsigned cand_generation = 1; 1679b8e80941Smrg while (!instructions.is_empty()) { 1680b8e80941Smrg schedule_node *chosen = choose_instruction_to_schedule(); 1681b8e80941Smrg 1682b8e80941Smrg /* Schedule this instruction. */ 1683b8e80941Smrg assert(chosen); 1684b8e80941Smrg chosen->remove(); 1685b8e80941Smrg chosen->inst->exec_node::remove(); 1686b8e80941Smrg block->instructions.push_tail(chosen->inst); 1687b8e80941Smrg instructions_to_schedule--; 1688b8e80941Smrg 1689b8e80941Smrg if (!post_reg_alloc) { 1690b8e80941Smrg reg_pressure -= get_register_pressure_benefit(chosen->inst); 1691b8e80941Smrg update_register_pressure(chosen->inst); 1692b8e80941Smrg } 1693b8e80941Smrg 1694b8e80941Smrg /* If we expected a delay for scheduling, then bump the clock to reflect 1695b8e80941Smrg * that. In reality, the hardware will switch to another hyperthread 1696b8e80941Smrg * and may not return to dispatching our thread for a while even after 1697b8e80941Smrg * we're unblocked. After this, we have the time when the chosen 1698b8e80941Smrg * instruction will start executing. 1699b8e80941Smrg */ 1700b8e80941Smrg time = MAX2(time, chosen->unblocked_time); 1701b8e80941Smrg 1702b8e80941Smrg /* Update the clock for how soon an instruction could start after the 1703b8e80941Smrg * chosen one. 
1704b8e80941Smrg */ 1705b8e80941Smrg time += issue_time(chosen->inst); 1706b8e80941Smrg 1707b8e80941Smrg if (debug) { 1708b8e80941Smrg fprintf(stderr, "clock %4d, scheduled: ", time); 1709b8e80941Smrg bs->dump_instruction(chosen->inst); 1710b8e80941Smrg if (!post_reg_alloc) 1711b8e80941Smrg fprintf(stderr, "(register pressure %d)\n", reg_pressure); 1712b8e80941Smrg } 1713b8e80941Smrg 1714b8e80941Smrg /* Now that we've scheduled a new instruction, some of its 1715b8e80941Smrg * children can be promoted to the list of instructions ready to 1716b8e80941Smrg * be scheduled. Update the children's unblocked time for this 1717b8e80941Smrg * DAG edge as we do so. 1718b8e80941Smrg */ 1719b8e80941Smrg for (int i = chosen->child_count - 1; i >= 0; i--) { 1720b8e80941Smrg schedule_node *child = chosen->children[i]; 1721b8e80941Smrg 1722b8e80941Smrg child->unblocked_time = MAX2(child->unblocked_time, 1723b8e80941Smrg time + chosen->child_latency[i]); 1724b8e80941Smrg 1725b8e80941Smrg if (debug) { 1726b8e80941Smrg fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count); 1727b8e80941Smrg bs->dump_instruction(child->inst); 1728b8e80941Smrg } 1729b8e80941Smrg 1730b8e80941Smrg child->cand_generation = cand_generation; 1731b8e80941Smrg child->parent_count--; 1732b8e80941Smrg if (child->parent_count == 0) { 1733b8e80941Smrg if (debug) { 1734b8e80941Smrg fprintf(stderr, "\t\tnow available\n"); 1735b8e80941Smrg } 1736b8e80941Smrg instructions.push_head(child); 1737b8e80941Smrg } 1738b8e80941Smrg } 1739b8e80941Smrg cand_generation++; 1740b8e80941Smrg 1741b8e80941Smrg /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+ 1742b8e80941Smrg * but it's more limited pre-gen6, so if we send something off to it then 1743b8e80941Smrg * the next math instruction isn't going to make progress until the first 1744b8e80941Smrg * is done. 
1745b8e80941Smrg */ 1746b8e80941Smrg if (devinfo->gen < 6 && chosen->inst->is_math()) { 1747b8e80941Smrg foreach_in_list(schedule_node, n, &instructions) { 1748b8e80941Smrg if (n->inst->is_math()) 1749b8e80941Smrg n->unblocked_time = MAX2(n->unblocked_time, 1750b8e80941Smrg time + chosen->latency); 1751b8e80941Smrg } 1752b8e80941Smrg } 1753b8e80941Smrg } 1754b8e80941Smrg 1755b8e80941Smrg assert(instructions_to_schedule == 0); 1756b8e80941Smrg 1757b8e80941Smrg block->cycle_count = time; 1758b8e80941Smrg} 1759b8e80941Smrg 1760b8e80941Smrgstatic unsigned get_cycle_count(cfg_t *cfg) 1761b8e80941Smrg{ 1762b8e80941Smrg unsigned count = 0, multiplier = 1; 1763b8e80941Smrg foreach_block(block, cfg) { 1764b8e80941Smrg if (block->start()->opcode == BRW_OPCODE_DO) 1765b8e80941Smrg multiplier *= 10; /* assume that loops execute ~10 times */ 1766b8e80941Smrg 1767b8e80941Smrg count += block->cycle_count * multiplier; 1768b8e80941Smrg 1769b8e80941Smrg if (block->end()->opcode == BRW_OPCODE_WHILE) 1770b8e80941Smrg multiplier /= 10; 1771b8e80941Smrg } 1772b8e80941Smrg 1773b8e80941Smrg return count; 1774b8e80941Smrg} 1775b8e80941Smrg 1776b8e80941Smrgvoid 1777b8e80941Smrginstruction_scheduler::run(cfg_t *cfg) 1778b8e80941Smrg{ 1779b8e80941Smrg if (debug && !post_reg_alloc) { 1780b8e80941Smrg fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n", 1781b8e80941Smrg post_reg_alloc); 1782b8e80941Smrg bs->dump_instructions(); 1783b8e80941Smrg } 1784b8e80941Smrg 1785b8e80941Smrg if (!post_reg_alloc) 1786b8e80941Smrg setup_liveness(cfg); 1787b8e80941Smrg 1788b8e80941Smrg foreach_block(block, cfg) { 1789b8e80941Smrg if (reads_remaining) { 1790b8e80941Smrg memset(reads_remaining, 0, 1791b8e80941Smrg grf_count * sizeof(*reads_remaining)); 1792b8e80941Smrg memset(hw_reads_remaining, 0, 1793b8e80941Smrg hw_reg_count * sizeof(*hw_reads_remaining)); 1794b8e80941Smrg memset(written, 0, grf_count * sizeof(*written)); 1795b8e80941Smrg 1796b8e80941Smrg foreach_inst_in_block(fs_inst, inst, 
block) 1797b8e80941Smrg count_reads_remaining(inst); 1798b8e80941Smrg } 1799b8e80941Smrg 1800b8e80941Smrg add_insts_from_block(block); 1801b8e80941Smrg 1802b8e80941Smrg calculate_deps(); 1803b8e80941Smrg 1804b8e80941Smrg compute_delays(); 1805b8e80941Smrg compute_exits(); 1806b8e80941Smrg 1807b8e80941Smrg schedule_instructions(block); 1808b8e80941Smrg } 1809b8e80941Smrg 1810b8e80941Smrg if (debug && !post_reg_alloc) { 1811b8e80941Smrg fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n", 1812b8e80941Smrg post_reg_alloc); 1813b8e80941Smrg bs->dump_instructions(); 1814b8e80941Smrg } 1815b8e80941Smrg 1816b8e80941Smrg cfg->cycle_count = get_cycle_count(cfg); 1817b8e80941Smrg} 1818b8e80941Smrg 1819b8e80941Smrgvoid 1820b8e80941Smrgfs_visitor::schedule_instructions(instruction_scheduler_mode mode) 1821b8e80941Smrg{ 1822b8e80941Smrg if (mode != SCHEDULE_POST) 1823b8e80941Smrg calculate_live_intervals(); 1824b8e80941Smrg 1825b8e80941Smrg int grf_count; 1826b8e80941Smrg if (mode == SCHEDULE_POST) 1827b8e80941Smrg grf_count = grf_used; 1828b8e80941Smrg else 1829b8e80941Smrg grf_count = alloc.count; 1830b8e80941Smrg 1831b8e80941Smrg fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf, 1832b8e80941Smrg cfg->num_blocks, mode); 1833b8e80941Smrg sched.run(cfg); 1834b8e80941Smrg 1835b8e80941Smrg invalidate_live_intervals(); 1836b8e80941Smrg} 1837b8e80941Smrg 1838b8e80941Smrgvoid 1839b8e80941Smrgvec4_visitor::opt_schedule_instructions() 1840b8e80941Smrg{ 1841b8e80941Smrg vec4_instruction_scheduler sched(this, prog_data->total_grf); 1842b8e80941Smrg sched.run(cfg); 1843b8e80941Smrg 1844b8e80941Smrg invalidate_live_intervals(); 1845b8e80941Smrg} 1846