/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_shader.h"

using namespace brw;

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering with latency, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part.  We're trying to be cheap,
 * since actually computing the optimal scheduling is NP complete.
 * What we do is track a "current clock".  When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock.  Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
5701e04c3fSmrg */ 5801e04c3fSmrg 5901e04c3fSmrgstatic bool debug = false; 6001e04c3fSmrg 6101e04c3fSmrgclass instruction_scheduler; 6201e04c3fSmrg 6301e04c3fSmrgclass schedule_node : public exec_node 6401e04c3fSmrg{ 6501e04c3fSmrgpublic: 6601e04c3fSmrg schedule_node(backend_instruction *inst, instruction_scheduler *sched); 677ec681f3Smrg void set_latency_gfx4(); 687ec681f3Smrg void set_latency_gfx7(bool is_haswell); 6901e04c3fSmrg 707ec681f3Smrg const struct intel_device_info *devinfo; 7101e04c3fSmrg backend_instruction *inst; 7201e04c3fSmrg schedule_node **children; 7301e04c3fSmrg int *child_latency; 7401e04c3fSmrg int child_count; 7501e04c3fSmrg int parent_count; 7601e04c3fSmrg int child_array_size; 7701e04c3fSmrg int unblocked_time; 7801e04c3fSmrg int latency; 7901e04c3fSmrg 8001e04c3fSmrg /** 8101e04c3fSmrg * Which iteration of pushing groups of children onto the candidates list 8201e04c3fSmrg * this node was a part of. 8301e04c3fSmrg */ 8401e04c3fSmrg unsigned cand_generation; 8501e04c3fSmrg 8601e04c3fSmrg /** 8701e04c3fSmrg * This is the sum of the instruction's latency plus the maximum delay of 8801e04c3fSmrg * its children, or just the issue_time if it's a leaf node. 8901e04c3fSmrg */ 9001e04c3fSmrg int delay; 9101e04c3fSmrg 9201e04c3fSmrg /** 9301e04c3fSmrg * Preferred exit node among the (direct or indirect) successors of this 9401e04c3fSmrg * node. Among the scheduler nodes blocked by this node, this will be the 9501e04c3fSmrg * one that may cause earliest program termination, or NULL if none of the 9601e04c3fSmrg * successors is an exit node. 9701e04c3fSmrg */ 9801e04c3fSmrg schedule_node *exit; 9901e04c3fSmrg}; 10001e04c3fSmrg 10101e04c3fSmrg/** 10201e04c3fSmrg * Lower bound of the scheduling time after which one of the instructions 10301e04c3fSmrg * blocked by this node may lead to program termination. 
10401e04c3fSmrg * 10501e04c3fSmrg * exit_unblocked_time() determines a strict partial ordering relation '«' on 10601e04c3fSmrg * the set of scheduler nodes as follows: 10701e04c3fSmrg * 10801e04c3fSmrg * n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m) 10901e04c3fSmrg * 11001e04c3fSmrg * which can be used to heuristically order nodes according to how early they 11101e04c3fSmrg * can unblock an exit node and lead to program termination. 11201e04c3fSmrg */ 11301e04c3fSmrgstatic inline int 11401e04c3fSmrgexit_unblocked_time(const schedule_node *n) 11501e04c3fSmrg{ 11601e04c3fSmrg return n->exit ? n->exit->unblocked_time : INT_MAX; 11701e04c3fSmrg} 11801e04c3fSmrg 11901e04c3fSmrgvoid 1207ec681f3Smrgschedule_node::set_latency_gfx4() 12101e04c3fSmrg{ 12201e04c3fSmrg int chans = 8; 12301e04c3fSmrg int math_latency = 22; 12401e04c3fSmrg 12501e04c3fSmrg switch (inst->opcode) { 12601e04c3fSmrg case SHADER_OPCODE_RCP: 12701e04c3fSmrg this->latency = 1 * chans * math_latency; 12801e04c3fSmrg break; 12901e04c3fSmrg case SHADER_OPCODE_RSQ: 13001e04c3fSmrg this->latency = 2 * chans * math_latency; 13101e04c3fSmrg break; 13201e04c3fSmrg case SHADER_OPCODE_INT_QUOTIENT: 13301e04c3fSmrg case SHADER_OPCODE_SQRT: 13401e04c3fSmrg case SHADER_OPCODE_LOG2: 13501e04c3fSmrg /* full precision log. partial is 2. */ 13601e04c3fSmrg this->latency = 3 * chans * math_latency; 13701e04c3fSmrg break; 13801e04c3fSmrg case SHADER_OPCODE_INT_REMAINDER: 13901e04c3fSmrg case SHADER_OPCODE_EXP2: 14001e04c3fSmrg /* full precision. partial is 3, same throughput. */ 14101e04c3fSmrg this->latency = 4 * chans * math_latency; 14201e04c3fSmrg break; 14301e04c3fSmrg case SHADER_OPCODE_POW: 14401e04c3fSmrg this->latency = 8 * chans * math_latency; 14501e04c3fSmrg break; 14601e04c3fSmrg case SHADER_OPCODE_SIN: 14701e04c3fSmrg case SHADER_OPCODE_COS: 14801e04c3fSmrg /* minimum latency, max is 12 rounds. 
*/ 14901e04c3fSmrg this->latency = 5 * chans * math_latency; 15001e04c3fSmrg break; 15101e04c3fSmrg default: 15201e04c3fSmrg this->latency = 2; 15301e04c3fSmrg break; 15401e04c3fSmrg } 15501e04c3fSmrg} 15601e04c3fSmrg 15701e04c3fSmrgvoid 1587ec681f3Smrgschedule_node::set_latency_gfx7(bool is_haswell) 15901e04c3fSmrg{ 16001e04c3fSmrg switch (inst->opcode) { 16101e04c3fSmrg case BRW_OPCODE_MAD: 16201e04c3fSmrg /* 2 cycles 16301e04c3fSmrg * (since the last two src operands are in different register banks): 16401e04c3fSmrg * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 16501e04c3fSmrg * 16601e04c3fSmrg * 3 cycles on IVB, 4 on HSW 16701e04c3fSmrg * (since the last two src operands are in the same register bank): 16801e04c3fSmrg * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 16901e04c3fSmrg * 17001e04c3fSmrg * 18 cycles on IVB, 16 on HSW 17101e04c3fSmrg * (since the last two src operands are in different register banks): 17201e04c3fSmrg * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 17301e04c3fSmrg * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q }; 17401e04c3fSmrg * 17501e04c3fSmrg * 20 cycles on IVB, 18 on HSW 17601e04c3fSmrg * (since the last two src operands are in the same register bank): 17701e04c3fSmrg * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 17801e04c3fSmrg * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; 17901e04c3fSmrg */ 18001e04c3fSmrg 18101e04c3fSmrg /* Our register allocator doesn't know about register banks, so use the 18201e04c3fSmrg * higher latency. 18301e04c3fSmrg */ 18401e04c3fSmrg latency = is_haswell ? 
16 : 18; 18501e04c3fSmrg break; 18601e04c3fSmrg 18701e04c3fSmrg case BRW_OPCODE_LRP: 18801e04c3fSmrg /* 2 cycles 18901e04c3fSmrg * (since the last two src operands are in different register banks): 19001e04c3fSmrg * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 19101e04c3fSmrg * 19201e04c3fSmrg * 3 cycles on IVB, 4 on HSW 19301e04c3fSmrg * (since the last two src operands are in the same register bank): 19401e04c3fSmrg * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 19501e04c3fSmrg * 19601e04c3fSmrg * 16 cycles on IVB, 14 on HSW 19701e04c3fSmrg * (since the last two src operands are in different register banks): 19801e04c3fSmrg * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q }; 19901e04c3fSmrg * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; 20001e04c3fSmrg * 20101e04c3fSmrg * 16 cycles 20201e04c3fSmrg * (since the last two src operands are in the same register bank): 20301e04c3fSmrg * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q }; 20401e04c3fSmrg * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q }; 20501e04c3fSmrg */ 20601e04c3fSmrg 20701e04c3fSmrg /* Our register allocator doesn't know about register banks, so use the 20801e04c3fSmrg * higher latency. 
20901e04c3fSmrg */ 21001e04c3fSmrg latency = 14; 21101e04c3fSmrg break; 21201e04c3fSmrg 21301e04c3fSmrg case SHADER_OPCODE_RCP: 21401e04c3fSmrg case SHADER_OPCODE_RSQ: 21501e04c3fSmrg case SHADER_OPCODE_SQRT: 21601e04c3fSmrg case SHADER_OPCODE_LOG2: 21701e04c3fSmrg case SHADER_OPCODE_EXP2: 21801e04c3fSmrg case SHADER_OPCODE_SIN: 21901e04c3fSmrg case SHADER_OPCODE_COS: 22001e04c3fSmrg /* 2 cycles: 22101e04c3fSmrg * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; 22201e04c3fSmrg * 22301e04c3fSmrg * 18 cycles: 22401e04c3fSmrg * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q }; 22501e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 22601e04c3fSmrg * 22701e04c3fSmrg * Same for exp2, log2, rsq, sqrt, sin, cos. 22801e04c3fSmrg */ 22901e04c3fSmrg latency = is_haswell ? 14 : 16; 23001e04c3fSmrg break; 23101e04c3fSmrg 23201e04c3fSmrg case SHADER_OPCODE_POW: 23301e04c3fSmrg /* 2 cycles: 23401e04c3fSmrg * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; 23501e04c3fSmrg * 23601e04c3fSmrg * 26 cycles: 23701e04c3fSmrg * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q }; 23801e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 23901e04c3fSmrg */ 24001e04c3fSmrg latency = is_haswell ? 
22 : 24; 24101e04c3fSmrg break; 24201e04c3fSmrg 24301e04c3fSmrg case SHADER_OPCODE_TEX: 24401e04c3fSmrg case SHADER_OPCODE_TXD: 24501e04c3fSmrg case SHADER_OPCODE_TXF: 24601e04c3fSmrg case SHADER_OPCODE_TXF_LZ: 24701e04c3fSmrg case SHADER_OPCODE_TXL: 24801e04c3fSmrg case SHADER_OPCODE_TXL_LZ: 24901e04c3fSmrg /* 18 cycles: 25001e04c3fSmrg * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 25101e04c3fSmrg * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 25201e04c3fSmrg * send(8) g4<1>UW g114<8,8,1>F 25301e04c3fSmrg * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 25401e04c3fSmrg * 25501e04c3fSmrg * 697 +/-49 cycles (min 610, n=26): 25601e04c3fSmrg * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 25701e04c3fSmrg * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 25801e04c3fSmrg * send(8) g4<1>UW g114<8,8,1>F 25901e04c3fSmrg * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 26001e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 26101e04c3fSmrg * 26201e04c3fSmrg * So the latency on our first texture load of the batchbuffer takes 26301e04c3fSmrg * ~700 cycles, since the caches are cold at that point. 26401e04c3fSmrg * 26501e04c3fSmrg * 840 +/- 92 cycles (min 720, n=25): 26601e04c3fSmrg * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 26701e04c3fSmrg * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 26801e04c3fSmrg * send(8) g4<1>UW g114<8,8,1>F 26901e04c3fSmrg * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 27001e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 27101e04c3fSmrg * send(8) g4<1>UW g114<8,8,1>F 27201e04c3fSmrg * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 27301e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 27401e04c3fSmrg * 27501e04c3fSmrg * On the second load, it takes just an extra ~140 cycles, and after 27601e04c3fSmrg * accounting for the 14 cycles of the MOV's latency, that makes ~130. 
27701e04c3fSmrg * 27801e04c3fSmrg * 683 +/- 49 cycles (min = 602, n=47): 27901e04c3fSmrg * mov(8) g115<1>F 0F { align1 WE_normal 1Q }; 28001e04c3fSmrg * mov(8) g114<1>F 0F { align1 WE_normal 1Q }; 28101e04c3fSmrg * send(8) g4<1>UW g114<8,8,1>F 28201e04c3fSmrg * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 28301e04c3fSmrg * send(8) g50<1>UW g114<8,8,1>F 28401e04c3fSmrg * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q }; 28501e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 28601e04c3fSmrg * 28701e04c3fSmrg * The unit appears to be pipelined, since this matches up with the 28801e04c3fSmrg * cache-cold case, despite there being two loads here. If you replace 28901e04c3fSmrg * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39). 29001e04c3fSmrg * 29101e04c3fSmrg * So, take some number between the cache-hot 140 cycles and the 29201e04c3fSmrg * cache-cold 700 cycles. No particular tuning was done on this. 29301e04c3fSmrg * 29401e04c3fSmrg * I haven't done significant testing of the non-TEX opcodes. TXL at 29501e04c3fSmrg * least looked about the same as TEX. 
29601e04c3fSmrg */ 29701e04c3fSmrg latency = 200; 29801e04c3fSmrg break; 29901e04c3fSmrg 30001e04c3fSmrg case SHADER_OPCODE_TXS: 30101e04c3fSmrg /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41 30201e04c3fSmrg * cycles (n=15): 30301e04c3fSmrg * mov(8) g114<1>UD 0D { align1 WE_normal 1Q }; 30401e04c3fSmrg * send(8) g6<1>UW g114<8,8,1>F 30501e04c3fSmrg * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q }; 30601e04c3fSmrg * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q }; 30701e04c3fSmrg * 30801e04c3fSmrg * 30901e04c3fSmrg * Two loads was 535 +/- 30 cycles (n=19): 31001e04c3fSmrg * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; 31101e04c3fSmrg * send(16) g6<1>UW g114<8,8,1>F 31201e04c3fSmrg * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; 31301e04c3fSmrg * mov(16) g114<1>UD 0D { align1 WE_normal 1H }; 31401e04c3fSmrg * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H }; 31501e04c3fSmrg * send(16) g8<1>UW g114<8,8,1>F 31601e04c3fSmrg * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H }; 31701e04c3fSmrg * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H }; 31801e04c3fSmrg * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H }; 31901e04c3fSmrg * 32001e04c3fSmrg * Since the only caches that should matter are just the 32101e04c3fSmrg * instruction/state cache containing the surface state, assume that we 32201e04c3fSmrg * always have hot caches. 
32301e04c3fSmrg */ 32401e04c3fSmrg latency = 100; 32501e04c3fSmrg break; 32601e04c3fSmrg 3277ec681f3Smrg case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: 32801e04c3fSmrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 3297ec681f3Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: 33001e04c3fSmrg case VS_OPCODE_PULL_CONSTANT_LOAD: 33101e04c3fSmrg /* testing using varying-index pull constants: 33201e04c3fSmrg * 33301e04c3fSmrg * 16 cycles: 33401e04c3fSmrg * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; 33501e04c3fSmrg * send(8) g4<1>F g4<8,8,1>D 33601e04c3fSmrg * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 33701e04c3fSmrg * 33801e04c3fSmrg * ~480 cycles: 33901e04c3fSmrg * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; 34001e04c3fSmrg * send(8) g4<1>F g4<8,8,1>D 34101e04c3fSmrg * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 34201e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 34301e04c3fSmrg * 34401e04c3fSmrg * ~620 cycles: 34501e04c3fSmrg * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q }; 34601e04c3fSmrg * send(8) g4<1>F g4<8,8,1>D 34701e04c3fSmrg * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 34801e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 34901e04c3fSmrg * send(8) g4<1>F g4<8,8,1>D 35001e04c3fSmrg * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q }; 35101e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 35201e04c3fSmrg * 35301e04c3fSmrg * So, if it's cache-hot, it's about 140. If it's cache cold, it's 35401e04c3fSmrg * about 460. We expect to mostly be cache hot, so pick something more 35501e04c3fSmrg * in that direction. 
35601e04c3fSmrg */ 35701e04c3fSmrg latency = 200; 35801e04c3fSmrg break; 35901e04c3fSmrg 3607ec681f3Smrg case SHADER_OPCODE_GFX7_SCRATCH_READ: 36101e04c3fSmrg /* Testing a load from offset 0, that had been previously written: 36201e04c3fSmrg * 36301e04c3fSmrg * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q }; 36401e04c3fSmrg * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q }; 36501e04c3fSmrg * 36601e04c3fSmrg * The cycles spent seemed to be grouped around 40-50 (as low as 38), 36701e04c3fSmrg * then around 140. Presumably this is cache hit vs miss. 36801e04c3fSmrg */ 36901e04c3fSmrg latency = 50; 37001e04c3fSmrg break; 37101e04c3fSmrg 3729f464c52Smaya case VEC4_OPCODE_UNTYPED_ATOMIC: 3737ec681f3Smrg /* See GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */ 37401e04c3fSmrg latency = 14000; 37501e04c3fSmrg break; 37601e04c3fSmrg 3779f464c52Smaya case VEC4_OPCODE_UNTYPED_SURFACE_READ: 3789f464c52Smaya case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: 3797ec681f3Smrg /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */ 38001e04c3fSmrg latency = is_haswell ? 
300 : 600; 38101e04c3fSmrg break; 38201e04c3fSmrg 3839f464c52Smaya case SHADER_OPCODE_SEND: 3849f464c52Smaya switch (inst->sfid) { 3859f464c52Smaya case BRW_SFID_SAMPLER: { 3869f464c52Smaya unsigned msg_type = (inst->desc >> 12) & 0x1f; 3879f464c52Smaya switch (msg_type) { 3887ec681f3Smrg case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO: 3897ec681f3Smrg case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO: 3909f464c52Smaya /* See also SHADER_OPCODE_TXS */ 3919f464c52Smaya latency = 100; 3929f464c52Smaya break; 3939f464c52Smaya 3949f464c52Smaya default: 3959f464c52Smaya /* See also SHADER_OPCODE_TEX */ 3969f464c52Smaya latency = 200; 3979f464c52Smaya break; 3989f464c52Smaya } 3999f464c52Smaya break; 4009f464c52Smaya } 4019f464c52Smaya 4027ec681f3Smrg case GFX6_SFID_DATAPORT_RENDER_CACHE: 4037ec681f3Smrg switch (brw_fb_desc_msg_type(devinfo, inst->desc)) { 4047ec681f3Smrg case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE: 4057ec681f3Smrg case GFX7_DATAPORT_RC_TYPED_SURFACE_READ: 4069f464c52Smaya /* See also SHADER_OPCODE_TYPED_SURFACE_READ */ 4079f464c52Smaya assert(!is_haswell); 4089f464c52Smaya latency = 600; 4099f464c52Smaya break; 4109f464c52Smaya 4117ec681f3Smrg case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP: 4129f464c52Smaya /* See also SHADER_OPCODE_TYPED_ATOMIC */ 4139f464c52Smaya assert(!is_haswell); 4149f464c52Smaya latency = 14000; 4159f464c52Smaya break; 4169f464c52Smaya 4177ec681f3Smrg case GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE: 4187ec681f3Smrg /* completely fabricated number */ 4197ec681f3Smrg latency = 600; 4207ec681f3Smrg break; 4217ec681f3Smrg 4229f464c52Smaya default: 4239f464c52Smaya unreachable("Unknown render cache message"); 4249f464c52Smaya } 4259f464c52Smaya break; 4269f464c52Smaya 4277ec681f3Smrg case GFX7_SFID_DATAPORT_DATA_CACHE: 4289f464c52Smaya switch ((inst->desc >> 14) & 0x1f) { 4297ec681f3Smrg case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ: 4307ec681f3Smrg case GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ: 4317ec681f3Smrg case 
GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE: 4327ec681f3Smrg /* We have no data for this but assume it's a little faster than 4337ec681f3Smrg * untyped surface read/write. 4347ec681f3Smrg */ 4357ec681f3Smrg latency = 200; 4367ec681f3Smrg break; 4377ec681f3Smrg 4387ec681f3Smrg case GFX7_DATAPORT_DC_DWORD_SCATTERED_READ: 4397ec681f3Smrg case GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE: 4409f464c52Smaya case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ: 4419f464c52Smaya case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE: 4429f464c52Smaya /* We have no data for this but assume it's roughly the same as 4439f464c52Smaya * untyped surface read/write. 4449f464c52Smaya */ 4459f464c52Smaya latency = 300; 4469f464c52Smaya break; 4479f464c52Smaya 4487ec681f3Smrg case GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ: 4497ec681f3Smrg case GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE: 4509f464c52Smaya /* Test code: 4519f464c52Smaya * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; 4529f464c52Smaya * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; 4539f464c52Smaya * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; 4549f464c52Smaya * send(8) g4<1>UD g112<8,8,1>UD 4559f464c52Smaya * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; 4569f464c52Smaya * . 4579f464c52Smaya * . [repeats 8 times] 4589f464c52Smaya * . 4599f464c52Smaya * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q }; 4609f464c52Smaya * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all }; 4619f464c52Smaya * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q }; 4629f464c52Smaya * send(8) g4<1>UD g112<8,8,1>UD 4639f464c52Smaya * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q }; 4649f464c52Smaya * 4659f464c52Smaya * Running it 100 times as fragment shader on a 128x128 quad 4669f464c52Smaya * gives an average latency of 583 cycles per surface read, 4679f464c52Smaya * standard deviation 0.9%. 
4689f464c52Smaya */ 4699f464c52Smaya assert(!is_haswell); 4709f464c52Smaya latency = 600; 4719f464c52Smaya break; 4729f464c52Smaya 4737ec681f3Smrg case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: 4749f464c52Smaya /* Test code: 4759f464c52Smaya * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q }; 4769f464c52Smaya * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all }; 4779f464c52Smaya * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q }; 4789f464c52Smaya * send(8) g4<1>ud g112<8,8,1>ud 4799f464c52Smaya * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q }; 4809f464c52Smaya * 4819f464c52Smaya * Running it 100 times as fragment shader on a 128x128 quad 4829f464c52Smaya * gives an average latency of 13867 cycles per atomic op, 4839f464c52Smaya * standard deviation 3%. Note that this is a rather 4849f464c52Smaya * pessimistic estimate, the actual latency in cases with few 4859f464c52Smaya * collisions between threads and favorable pipelining has been 4869f464c52Smaya * seen to be reduced by a factor of 100. 
4879f464c52Smaya */ 4889f464c52Smaya assert(!is_haswell); 4899f464c52Smaya latency = 14000; 4909f464c52Smaya break; 4919f464c52Smaya 4929f464c52Smaya default: 4939f464c52Smaya unreachable("Unknown data cache message"); 4949f464c52Smaya } 4959f464c52Smaya break; 4969f464c52Smaya 4979f464c52Smaya case HSW_SFID_DATAPORT_DATA_CACHE_1: 4989f464c52Smaya switch ((inst->desc >> 14) & 0x1f) { 4999f464c52Smaya case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ: 5009f464c52Smaya case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE: 5019f464c52Smaya case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ: 5029f464c52Smaya case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: 5037ec681f3Smrg case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE: 5047ec681f3Smrg case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: 5057ec681f3Smrg case GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE: 5067ec681f3Smrg case GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ: 5077ec681f3Smrg case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ: 5087ec681f3Smrg case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE: 5097ec681f3Smrg /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */ 5109f464c52Smaya latency = 300; 5119f464c52Smaya break; 5129f464c52Smaya 5139f464c52Smaya case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: 5149f464c52Smaya case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: 5159f464c52Smaya case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: 5169f464c52Smaya case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: 5177ec681f3Smrg case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP: 5187ec681f3Smrg case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP: 5197ec681f3Smrg case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP: 5207ec681f3Smrg case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP: 5217ec681f3Smrg case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP: 5227ec681f3Smrg /* See also GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */ 5239f464c52Smaya latency = 14000; 5249f464c52Smaya break; 5259f464c52Smaya 5269f464c52Smaya default: 
5279f464c52Smaya unreachable("Unknown data cache message"); 5289f464c52Smaya } 5299f464c52Smaya break; 5309f464c52Smaya 5317ec681f3Smrg case GFX12_SFID_UGM: 5327ec681f3Smrg case GFX12_SFID_TGM: 5337ec681f3Smrg case GFX12_SFID_SLM: 5347ec681f3Smrg switch (lsc_msg_desc_opcode(devinfo, inst->desc)) { 5357ec681f3Smrg case LSC_OP_LOAD: 5367ec681f3Smrg case LSC_OP_STORE: 5377ec681f3Smrg case LSC_OP_LOAD_CMASK: 5387ec681f3Smrg case LSC_OP_STORE_CMASK: 5397ec681f3Smrg latency = 300; 5407ec681f3Smrg break; 5417ec681f3Smrg case LSC_OP_FENCE: 5427ec681f3Smrg case LSC_OP_ATOMIC_INC: 5437ec681f3Smrg case LSC_OP_ATOMIC_DEC: 5447ec681f3Smrg case LSC_OP_ATOMIC_LOAD: 5457ec681f3Smrg case LSC_OP_ATOMIC_STORE: 5467ec681f3Smrg case LSC_OP_ATOMIC_ADD: 5477ec681f3Smrg case LSC_OP_ATOMIC_SUB: 5487ec681f3Smrg case LSC_OP_ATOMIC_MIN: 5497ec681f3Smrg case LSC_OP_ATOMIC_MAX: 5507ec681f3Smrg case LSC_OP_ATOMIC_UMIN: 5517ec681f3Smrg case LSC_OP_ATOMIC_UMAX: 5527ec681f3Smrg case LSC_OP_ATOMIC_CMPXCHG: 5537ec681f3Smrg case LSC_OP_ATOMIC_FADD: 5547ec681f3Smrg case LSC_OP_ATOMIC_FSUB: 5557ec681f3Smrg case LSC_OP_ATOMIC_FMIN: 5567ec681f3Smrg case LSC_OP_ATOMIC_FMAX: 5577ec681f3Smrg case LSC_OP_ATOMIC_FCMPXCHG: 5587ec681f3Smrg case LSC_OP_ATOMIC_AND: 5597ec681f3Smrg case LSC_OP_ATOMIC_OR: 5607ec681f3Smrg case LSC_OP_ATOMIC_XOR: 5617ec681f3Smrg latency = 1400; 5627ec681f3Smrg break; 5637ec681f3Smrg default: 5647ec681f3Smrg unreachable("unsupported new data port message instruction"); 5657ec681f3Smrg } 5667ec681f3Smrg break; 5677ec681f3Smrg 5687ec681f3Smrg case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: 5697ec681f3Smrg case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: 5707ec681f3Smrg /* TODO. 5717ec681f3Smrg * 5727ec681f3Smrg * We'll assume for the moment that this is pretty quick as it 5737ec681f3Smrg * doesn't actually return any data. 
5747ec681f3Smrg */ 5757ec681f3Smrg latency = 200; 5767ec681f3Smrg break; 5777ec681f3Smrg 5789f464c52Smaya default: 5799f464c52Smaya unreachable("Unknown SFID"); 5809f464c52Smaya } 5819f464c52Smaya break; 5829f464c52Smaya 58301e04c3fSmrg default: 58401e04c3fSmrg /* 2 cycles: 58501e04c3fSmrg * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; 58601e04c3fSmrg * 58701e04c3fSmrg * 16 cycles: 58801e04c3fSmrg * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q }; 58901e04c3fSmrg * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q }; 59001e04c3fSmrg */ 59101e04c3fSmrg latency = 14; 59201e04c3fSmrg break; 59301e04c3fSmrg } 59401e04c3fSmrg} 59501e04c3fSmrg 59601e04c3fSmrgclass instruction_scheduler { 59701e04c3fSmrgpublic: 5987ec681f3Smrg instruction_scheduler(const backend_shader *s, int grf_count, 5999f464c52Smaya unsigned hw_reg_count, int block_count, 6007ec681f3Smrg instruction_scheduler_mode mode): 6017ec681f3Smrg bs(s) 60201e04c3fSmrg { 60301e04c3fSmrg this->mem_ctx = ralloc_context(NULL); 60401e04c3fSmrg this->grf_count = grf_count; 60501e04c3fSmrg this->hw_reg_count = hw_reg_count; 60601e04c3fSmrg this->instructions.make_empty(); 60701e04c3fSmrg this->post_reg_alloc = (mode == SCHEDULE_POST); 60801e04c3fSmrg this->mode = mode; 6097ec681f3Smrg this->reg_pressure = 0; 6107ec681f3Smrg this->block_idx = 0; 61101e04c3fSmrg if (!post_reg_alloc) { 61201e04c3fSmrg this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count); 61301e04c3fSmrg 61401e04c3fSmrg this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count); 61501e04c3fSmrg for (int i = 0; i < block_count; i++) 61601e04c3fSmrg this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD, 61701e04c3fSmrg BITSET_WORDS(grf_count)); 61801e04c3fSmrg 61901e04c3fSmrg this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); 62001e04c3fSmrg for (int i = 0; i < block_count; i++) 62101e04c3fSmrg this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, 62201e04c3fSmrg BITSET_WORDS(grf_count)); 62301e04c3fSmrg 
/* Continuation of the instruction_scheduler constructor: this branch is the
 * register-pressure-tracking path (the matching "if" opens before this
 * chunk).  All arrays are parented to mem_ctx, so the destructor's single
 * ralloc_free() reclaims everything at once.
 */
         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
         for (int i = 0; i < block_count; i++)
            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                BITSET_WORDS(hw_reg_count));

         /* Per-VGRF flag: has a write been scheduled yet? */
         this->written = rzalloc_array(mem_ctx, bool, grf_count);

         /* Per-VGRF count of not-yet-scheduled reads. */
         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);

         /* Per-hardware-GRF count of not-yet-scheduled reads. */
         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
      } else {
         /* Pressure tracking is unused on this path; leave everything NULL
          * so the helpers can detect that cheaply (see the
          * "if (!reads_remaining) return;" guards below).
          */
         this->reg_pressure_in = NULL;
         this->livein = NULL;
         this->liveout = NULL;
         this->hw_liveout = NULL;
         this->written = NULL;
         this->reads_remaining = NULL;
         this->hw_reads_remaining = NULL;
      }
   }

   ~instruction_scheduler()
   {
      /* Every allocation made by the scheduler hangs off mem_ctx. */
      ralloc_free(this->mem_ctx);
   }

   /* Force @n to run after everything before it and before everything that
    * follows it (up to the nearest neighboring barrier in each direction).
    */
   void add_barrier_deps(schedule_node *n);
   /* Add an ordering edge so @after is scheduled after @before, ideally
    * @latency cycles later.  The two-argument form uses @before's own
    * latency.
    */
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void run(cfg_t *cfg);
   void add_insts_from_block(bblock_t *block);
   void compute_delays();
   void compute_exits();
   /* Backend-specific hooks implemented by the fs/vec4 subclasses below. */
   virtual void calculate_deps() = 0;
   virtual schedule_node *choose_instruction_to_schedule() = 0;

   /**
    * Returns how many cycles it takes the instruction to issue.
    *
    * Instructions in gen hardware are handled one simd4 vector at a time,
    * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
    */
   virtual int issue_time(backend_instruction *inst) = 0;

   virtual void count_reads_remaining(backend_instruction *inst) = 0;
   virtual void setup_liveness(cfg_t *cfg) = 0;
   virtual void update_register_pressure(backend_instruction *inst) = 0;
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;

   void schedule_instructions(bblock_t *block);

   void *mem_ctx;

   bool post_reg_alloc;
   int grf_count;
   unsigned hw_reg_count;
   int reg_pressure;
   int block_idx;
   exec_list instructions;
   const backend_shader *bs;

   instruction_scheduler_mode mode;

   /*
    * The register pressure at the beginning of each basic block.
    */

   int *reg_pressure_in;

   /*
    * The virtual GRF's whose range overlaps the beginning of each basic block.
    */

   BITSET_WORD **livein;

   /*
    * The virtual GRF's whose range overlaps the end of each basic block.
    */

   BITSET_WORD **liveout;

   /*
    * The hardware GRF's whose range overlaps the end of each basic block.
    */

   BITSET_WORD **hw_liveout;

   /*
    * Whether we've scheduled a write for this virtual GRF yet.
    */

   bool *written;

   /*
    * How many reads we haven't scheduled for this virtual GRF yet.
    */

   int *reads_remaining;

   /*
    * How many reads we haven't scheduled for this hardware GRF yet.
    */

   int *hw_reads_remaining;
};

/* Scheduler specialization for the scalar (FS) backend. */
class fs_instruction_scheduler : public instruction_scheduler
{
public:
   fs_instruction_scheduler(const fs_visitor *v, int grf_count, int hw_reg_count,
                            int block_count,
                            instruction_scheduler_mode mode);
   void calculate_deps();
   bool is_compressed(const fs_inst *inst);
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   const fs_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

fs_instruction_scheduler::fs_instruction_scheduler(const fs_visitor *v,
                                                   int grf_count, int hw_reg_count,
                                                   int block_count,
                                                   instruction_scheduler_mode mode)
   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
     v(v)
{
}

/* Returns true if src[src] repeats an earlier source of @inst, so that the
 * pressure-tracking code doesn't double-count reads of the same register.
 */
static bool
is_src_duplicate(fs_inst *inst, int src)
{
   for (int i = 0; i < src; i++)
      if (inst->src[i].equals(inst->src[src]))
         return true;

   return false;
}

/* Count one pending read per unique VGRF source of @be (and per hardware
 * GRF actually read for FIXED_GRF sources).  Scheduling later decrements
 * these to detect when a register's last read retires.
 */
void
fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
{
/* count_reads_remaining(), continued. */
   fs_inst *inst = (fs_inst *)be;

   /* NULL when this scheduler instance doesn't track pressure. */
   if (!reads_remaining)
      return;

   for (int i = 0; i < inst->sources; i++) {
      /* Repeated sources only count once. */
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]++;
      } else if (inst->src[i].file == FIXED_GRF) {
         /* Fixed GRFs beyond the tracked payload range are ignored. */
         if (inst->src[i].nr >= hw_reg_count)
            continue;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            hw_reads_remaining[inst->src[i].nr + j]++;
      }
   }
}

/* Populate livein/liveout/hw_liveout and the per-block starting register
 * pressure (reg_pressure_in) from the FS live-variable analysis.
 */
void
fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
{
   const fs_live_variables &live = v->live_analysis.require();

   /* First, compute liveness on a per-GRF level using the in/out sets from
    * liveness calculation.
    */
   for (int block = 0; block < cfg->num_blocks; block++) {
      for (int i = 0; i < live.num_vars; i++) {
         if (BITSET_TEST(live.block_data[block].livein, i)) {
            int vgrf = live.vgrf_from_var[i];
            /* Several vars can map to one VGRF; only count its size once. */
            if (!BITSET_TEST(livein[block], vgrf)) {
               reg_pressure_in[block] += v->alloc.sizes[vgrf];
               BITSET_SET(livein[block], vgrf);
            }
         }

         if (BITSET_TEST(live.block_data[block].liveout, i))
            BITSET_SET(liveout[block], live.vgrf_from_var[i]);
      }
   }

   /* Now, extend the live in/live out sets for when a range crosses a block
    * boundary, which matches what our register allocator/interference code
    * does to account for force_writemask_all and incompatible exec_mask's.
    */
   for (int block = 0; block < cfg->num_blocks - 1; block++) {
      for (int i = 0; i < grf_count; i++) {
         if (live.vgrf_start[i] <= cfg->blocks[block]->end_ip &&
             live.vgrf_end[i] >= cfg->blocks[block + 1]->start_ip) {
            if (!BITSET_TEST(livein[block + 1], i)) {
               reg_pressure_in[block + 1] += v->alloc.sizes[i];
               BITSET_SET(livein[block + 1], i);
            }

            BITSET_SET(liveout[block], i);
         }
      }
   }

   /* NOTE(review): variable-length array — a GCC/Clang extension in C++;
    * hw_reg_count is presumably small (payload registers).  TODO confirm
    * an upper bound if portability ever matters.
    */
   int payload_last_use_ip[hw_reg_count];
   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);

   for (unsigned i = 0; i < hw_reg_count; i++) {
      /* -1 marks a payload register that is never used. */
      if (payload_last_use_ip[i] == -1)
         continue;

      for (int block = 0; block < cfg->num_blocks; block++) {
         if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
            reg_pressure_in[block]++;

         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
            BITSET_SET(hw_liveout[block], i);
      }
   }
}

/* Bookkeeping run as each instruction is scheduled: mark its destination
 * VGRF written and retire one pending read per unique source.
 */
void
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!reads_remaining)
      return;

   if (inst->dst.file == VGRF) {
      written[inst->dst.nr] = true;
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF) {
         reads_remaining[inst->src[i].nr]--;
      } else if (inst->src[i].file == FIXED_GRF &&
                 inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++)
            hw_reads_remaining[inst->src[i].nr + off]--;
      }
   }
}

/* Estimate the register-pressure change (in registers) of scheduling @be
 * next: negative for a first write to a VGRF not live into this block
 * (a new range opens), positive for sources whose last read this would be
 * (their ranges close).
 */
int
fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;
   int benefit = 0;

   if (inst->dst.file == VGRF) {
      if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
          !written[inst->dst.nr])
         benefit -= v->alloc.sizes[inst->dst.nr];
   }

   for (int i = 0; i < inst->sources; i++) {
      if (is_src_duplicate(inst, i))
         continue;

      if (inst->src[i].file == VGRF &&
          !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
          reads_remaining[inst->src[i].nr] == 1)
         benefit += v->alloc.sizes[inst->src[i].nr];

      /* Same idea for payload registers, counted one GRF at a time. */
      if (inst->src[i].file == FIXED_GRF &&
          inst->src[i].nr < hw_reg_count) {
         for (unsigned off = 0; off < regs_read(inst, i); off++) {
            int reg = inst->src[i].nr + off;
            if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
                hw_reads_remaining[reg] == 1) {
               benefit++;
            }
         }
      }
   }

   return benefit;
}

/* Scheduler specialization for the vec4 backend.  Constructed with
 * SCHEDULE_POST (see the constructor below), so all the register-pressure
 * hooks are no-op stubs.
 */
class vec4_instruction_scheduler : public instruction_scheduler
{
public:
   vec4_instruction_scheduler(const vec4_visitor *v, int grf_count);
   void calculate_deps();
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   const vec4_visitor *v;

   void count_reads_remaining(backend_instruction *inst);
   void setup_liveness(cfg_t *cfg);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

vec4_instruction_scheduler::vec4_instruction_scheduler(const vec4_visitor *v,
                                                       int grf_count)
   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
     v(v)
{
}

/* Pressure tracking is disabled for vec4 (SCHEDULE_POST): no-op. */
void
vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
{
}

/* No-op: see count_reads_remaining() above. */
void
vec4_instruction_scheduler::setup_liveness(cfg_t *)
{
}

/* No-op: see count_reads_remaining() above. */
void
vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
{
}

/* Always zero: pressure is not a scheduling criterion for vec4. */
int
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
{
   return 0;
}

/* One DAG node per instruction.  Child/latency arrays start empty and grow
 * in add_dep(); timing fields are filled in by the compute_* passes.
 */
schedule_node::schedule_node(backend_instruction *inst,
                             instruction_scheduler *sched)
{
   const struct intel_device_info *devinfo = sched->bs->devinfo;

   this->devinfo = devinfo;
   this->inst = inst;
   this->child_array_size = 0;
   this->children = NULL;
   this->child_latency = NULL;
   this->child_count = 0;
   this->parent_count = 0;
   this->unblocked_time = 0;
   this->cand_generation = 0;
   this->delay = 0;
   this->exit = NULL;

   /* We can't measure Gfx6 timings directly but expect them to be much
    * closer to Gfx7 than Gfx4.
    */
   if (!sched->post_reg_alloc)
      this->latency = 1;
   else if (devinfo->ver >= 6)
      set_latency_gfx7(devinfo->is_haswell);
   else
      set_latency_gfx4();
}

/* Wrap every instruction of @block in a schedule_node, preserving order. */
void
instruction_scheduler::add_insts_from_block(bblock_t *block)
{
   foreach_inst_in_block(backend_instruction, inst, block) {
      schedule_node *n = new(mem_ctx) schedule_node(inst, this);

      instructions.push_tail(n);
   }
}

/** Computation of the delay member of each node.
 */
void
instruction_scheduler::compute_delays()
{
   /* Reverse program order guarantees every child's delay is final before
    * its parents are visited (the assert below checks this invariant).
    */
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      if (!n->child_count) {
         n->delay = issue_time(n->inst);
      } else {
         for (int i = 0; i < n->child_count; i++) {
            assert(n->children[i]->delay);
            n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
         }
      }
   }
}

void
instruction_scheduler::compute_exits()
{
   /* Calculate a lower bound of the scheduling time of each node in the
    * graph.  This is analogous to the node's critical path but calculated
    * from the top instead of from the bottom of the block.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      for (int i = 0; i < n->child_count; i++) {
         n->children[i]->unblocked_time =
            MAX2(n->children[i]->unblocked_time,
                 n->unblocked_time + issue_time(n->inst) + n->child_latency[i]);
      }
   }

   /* Calculate the exit of each node by induction based on the exit nodes of
    * its children.  The preferred exit of a node is the one among the exit
    * nodes of its children which can be unblocked first according to the
    * optimistic unblocked time estimate calculated above.
    */
   foreach_in_list_reverse(schedule_node, n, &instructions) {
      /* A HALT instruction is itself an exit; otherwise inherit the
       * earliest-unblocked child exit below.
       */
      n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL);

      for (int i = 0; i < n->child_count; i++) {
         if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n))
            n->exit = n->children[i]->exit;
      }
   }
}

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   /* Callers pass NULL freely for "no previous writer/reader". */
   if (!before || !after)
      return;

   assert(before != after);

   /* If the edge already exists, just keep the worst-case latency. */
   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   /* Grow the child arrays geometrically (16 minimum, then doubling). */
   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}

108001e04c3fSmrgvoid 108101e04c3fSmrginstruction_scheduler::add_dep(schedule_node *before, schedule_node *after) 108201e04c3fSmrg{ 108301e04c3fSmrg if (!before) 108401e04c3fSmrg return; 108501e04c3fSmrg 108601e04c3fSmrg add_dep(before, after, before->latency); 108701e04c3fSmrg} 108801e04c3fSmrg 108901e04c3fSmrgstatic bool 109001e04c3fSmrgis_scheduling_barrier(const backend_instruction *inst) 109101e04c3fSmrg{ 10927ec681f3Smrg return inst->opcode == SHADER_OPCODE_HALT_TARGET || 109301e04c3fSmrg inst->is_control_flow() || 109401e04c3fSmrg inst->has_side_effects(); 109501e04c3fSmrg} 109601e04c3fSmrg 109701e04c3fSmrg/** 109801e04c3fSmrg * Sometimes we really want this node to execute after everything that 109901e04c3fSmrg * was before it and before everything that followed it. This adds 110001e04c3fSmrg * the deps to do so. 110101e04c3fSmrg */ 110201e04c3fSmrgvoid 110301e04c3fSmrginstruction_scheduler::add_barrier_deps(schedule_node *n) 110401e04c3fSmrg{ 110501e04c3fSmrg schedule_node *prev = (schedule_node *)n->prev; 110601e04c3fSmrg schedule_node *next = (schedule_node *)n->next; 110701e04c3fSmrg 110801e04c3fSmrg if (prev) { 110901e04c3fSmrg while (!prev->is_head_sentinel()) { 111001e04c3fSmrg add_dep(prev, n, 0); 111101e04c3fSmrg if (is_scheduling_barrier(prev->inst)) 111201e04c3fSmrg break; 111301e04c3fSmrg prev = (schedule_node *)prev->prev; 111401e04c3fSmrg } 111501e04c3fSmrg } 111601e04c3fSmrg 111701e04c3fSmrg if (next) { 111801e04c3fSmrg while (!next->is_tail_sentinel()) { 111901e04c3fSmrg add_dep(n, next, 0); 112001e04c3fSmrg if (is_scheduling_barrier(next->inst)) 112101e04c3fSmrg break; 112201e04c3fSmrg next = (schedule_node *)next->next; 112301e04c3fSmrg } 112401e04c3fSmrg } 112501e04c3fSmrg} 112601e04c3fSmrg 112701e04c3fSmrg/* instruction scheduling needs to be aware of when an MRF write 112801e04c3fSmrg * actually writes 2 MRFs. 
 */
bool
fs_instruction_scheduler::is_compressed(const fs_inst *inst)
{
   return inst->exec_size == 16;
}

/* Build the dependency DAG for the FS backend: a forward pass adds RAW and
 * WAW edges (with latencies), then a backward pass adds zero-latency WAR
 * edges.  The "last writer/reader" trackers below are keyed per register
 * class.
 */
void
fs_instruction_scheduler::calculate_deps()
{
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
    * After register allocation, reg_offsets are gone and we track individual
    * GRF registers.
    */
   schedule_node **last_grf_write;
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
   /* One slot per flag subregister bit of flags_read()/flags_written(). */
   schedule_node *last_conditional_mod[8] = {};
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* 16 slots per VGRF cover per-REG_SIZE offsets in the pre-RA case.
    * NOTE(review): the calloc() arguments read as (size, nmemb) rather than
    * the conventional (nmemb, size); the total allocation is the same.
    */
   last_grf_write = (schedule_node **)calloc(sizeof(schedule_node *), grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(last_grf_write[inst->src[i].nr * 16 +
                                         inst->src[i].offset / REG_SIZE + r], n);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(last_grf_write[inst->src[i].nr + r], n);
            } else {
               add_dep(last_fixed_grf_write, n);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            /* Untracked architecture register: be conservative. */
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(last_conditional_mod[i], n);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr * 16 +
                                      inst->dst.offset / REG_SIZE + r], n);
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         /* A compressed MRF write touches a second register: +4 for
          * COMPR4 addressing, otherwise the next one.
          */
         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               add_dep(last_grf_write[inst->dst.nr + r], n);
               last_grf_write[inst->dst.nr + r] = n;
            }
         } else {
            add_dep(last_fixed_grf_write, n);
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i)) {
               add_dep(last_conditional_mod[i], n, 0);
               last_conditional_mod[i] = n;
            }
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
/* fs calculate_deps(), continued: finish resetting the trackers, then run
 * the bottom-to-top WAR pass (zero-latency edges from each read to the next
 * write of the same register).
 */
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               for (unsigned r = 0; r < regs_read(inst, i); r++) {
                  add_dep(n, last_grf_write[inst->src[i].nr * 16 +
                                            inst->src[i].offset / REG_SIZE + r], 0);
               }
            }
         } else if (inst->src[i].file == FIXED_GRF) {
            if (post_reg_alloc) {
               for (unsigned r = 0; r < regs_read(inst, i); r++)
                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
            } else {
               add_dep(n, last_fixed_grf_write, 0);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write, 0);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (const unsigned mask = inst->flags_read(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               add_dep(n, last_conditional_mod[i]);
         }
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as WAR dependency.
       */
      if (inst->dst.file == VGRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            for (unsigned r = 0; r < regs_written(inst); r++) {
               last_grf_write[inst->dst.nr * 16 +
                              inst->dst.offset / REG_SIZE + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.nr & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == FIXED_GRF) {
         if (post_reg_alloc) {
            for (unsigned r = 0; r < regs_written(inst); r++)
               last_grf_write[inst->dst.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (const unsigned mask = inst->flags_written(v->devinfo)) {
         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));

         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
            if (mask & (1 << i))
               last_conditional_mod[i] = n;
         }
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }

   /* Matches the calloc() at the top of this function. */
   free(last_grf_write);
}

/* vec4 analogue of the dependency-DAG construction above.  Simpler: no
 * per-offset VGRF tracking and a single flag register.
 */
void
vec4_instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
   schedule_node *last_conditional_mod = NULL;
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      if (is_scheduling_barrier(inst))
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(last_grf_write[inst->src[i].nr + j], n);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].is_accumulator()) {
            assert(last_accumulator_write);
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            /* Untracked architecture register: be conservative. */
            add_barrier_deps(n);
         }
      }

      if (inst->reads_g0_implicitly())
         add_dep(last_fixed_grf_write, n);

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (inst->reads_flag()) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      if (inst->reads_accumulator_implicitly()) {
         assert(last_accumulator_write);
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == VGRF) {
         for (unsigned j = 0; j < regs_written(inst); ++j) {
            add_dep(last_grf_write[inst->dst.nr + j], n);
            last_grf_write[inst->dst.nr + j] = n;
         }
      } else if (inst->dst.file == MRF) {
         add_dep(last_mrf_write[inst->dst.nr], n);
         last_mrf_write[inst->dst.nr] = n;
      } else if (inst->dst.file == FIXED_GRF) {
         add_dep(last_fixed_grf_write, n);
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag(v->devinfo)) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == VGRF) {
            for (unsigned j = 0; j < regs_read(inst, i); ++j)
               add_dep(n, last_grf_write[inst->src[i].nr + j]);
         } else if (inst->src[i].file == FIXED_GRF) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write);
         } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
            add_barrier_deps(n);
         }
      }

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
153301e04c3fSmrg */ 153401e04c3fSmrg add_dep(n, last_mrf_write[inst->base_mrf + i], 2); 153501e04c3fSmrg } 153601e04c3fSmrg } 153701e04c3fSmrg 153801e04c3fSmrg if (inst->reads_flag()) { 153901e04c3fSmrg add_dep(n, last_conditional_mod); 154001e04c3fSmrg } 154101e04c3fSmrg 154201e04c3fSmrg if (inst->reads_accumulator_implicitly()) { 154301e04c3fSmrg add_dep(n, last_accumulator_write); 154401e04c3fSmrg } 154501e04c3fSmrg 154601e04c3fSmrg /* Update the things this instruction wrote, so earlier reads 154701e04c3fSmrg * can mark this as WAR dependency. 154801e04c3fSmrg */ 154901e04c3fSmrg if (inst->dst.file == VGRF) { 155001e04c3fSmrg for (unsigned j = 0; j < regs_written(inst); ++j) 155101e04c3fSmrg last_grf_write[inst->dst.nr + j] = n; 155201e04c3fSmrg } else if (inst->dst.file == MRF) { 155301e04c3fSmrg last_mrf_write[inst->dst.nr] = n; 155401e04c3fSmrg } else if (inst->dst.file == FIXED_GRF) { 155501e04c3fSmrg last_fixed_grf_write = n; 155601e04c3fSmrg } else if (inst->dst.is_accumulator()) { 155701e04c3fSmrg last_accumulator_write = n; 155801e04c3fSmrg } else if (inst->dst.file == ARF && !inst->dst.is_null()) { 155901e04c3fSmrg add_barrier_deps(n); 156001e04c3fSmrg } 156101e04c3fSmrg 156201e04c3fSmrg if (inst->mlen > 0 && !inst->is_send_from_grf()) { 15637ec681f3Smrg for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { 156401e04c3fSmrg last_mrf_write[inst->base_mrf + i] = n; 156501e04c3fSmrg } 156601e04c3fSmrg } 156701e04c3fSmrg 15687ec681f3Smrg if (inst->writes_flag(v->devinfo)) { 156901e04c3fSmrg last_conditional_mod = n; 157001e04c3fSmrg } 157101e04c3fSmrg 157201e04c3fSmrg if (inst->writes_accumulator_implicitly(v->devinfo)) { 157301e04c3fSmrg last_accumulator_write = n; 157401e04c3fSmrg } 157501e04c3fSmrg } 157601e04c3fSmrg} 157701e04c3fSmrg 157801e04c3fSmrgschedule_node * 157901e04c3fSmrgfs_instruction_scheduler::choose_instruction_to_schedule() 158001e04c3fSmrg{ 158101e04c3fSmrg schedule_node *chosen = NULL; 158201e04c3fSmrg 158301e04c3fSmrg if (mode 
== SCHEDULE_PRE || mode == SCHEDULE_POST) { 158401e04c3fSmrg int chosen_time = 0; 158501e04c3fSmrg 158601e04c3fSmrg /* Of the instructions ready to execute or the closest to being ready, 158701e04c3fSmrg * choose the one most likely to unblock an early program exit, or 158801e04c3fSmrg * otherwise the oldest one. 158901e04c3fSmrg */ 159001e04c3fSmrg foreach_in_list(schedule_node, n, &instructions) { 159101e04c3fSmrg if (!chosen || 159201e04c3fSmrg exit_unblocked_time(n) < exit_unblocked_time(chosen) || 159301e04c3fSmrg (exit_unblocked_time(n) == exit_unblocked_time(chosen) && 159401e04c3fSmrg n->unblocked_time < chosen_time)) { 159501e04c3fSmrg chosen = n; 159601e04c3fSmrg chosen_time = n->unblocked_time; 159701e04c3fSmrg } 159801e04c3fSmrg } 159901e04c3fSmrg } else { 16007ec681f3Smrg int chosen_register_pressure_benefit = 0; 16017ec681f3Smrg 160201e04c3fSmrg /* Before register allocation, we don't care about the latencies of 160301e04c3fSmrg * instructions. All we care about is reducing live intervals of 160401e04c3fSmrg * variables so that we can avoid register spilling, or get SIMD16 160501e04c3fSmrg * shaders which naturally do a better job of hiding instruction 160601e04c3fSmrg * latency. 160701e04c3fSmrg */ 160801e04c3fSmrg foreach_in_list(schedule_node, n, &instructions) { 160901e04c3fSmrg fs_inst *inst = (fs_inst *)n->inst; 161001e04c3fSmrg 161101e04c3fSmrg if (!chosen) { 161201e04c3fSmrg chosen = n; 16137ec681f3Smrg chosen_register_pressure_benefit = 16147ec681f3Smrg get_register_pressure_benefit(chosen->inst); 161501e04c3fSmrg continue; 161601e04c3fSmrg } 161701e04c3fSmrg 161801e04c3fSmrg /* Most important: If we can definitely reduce register pressure, do 161901e04c3fSmrg * so immediately. 
162001e04c3fSmrg */ 162101e04c3fSmrg int register_pressure_benefit = get_register_pressure_benefit(n->inst); 162201e04c3fSmrg 162301e04c3fSmrg if (register_pressure_benefit > 0 && 162401e04c3fSmrg register_pressure_benefit > chosen_register_pressure_benefit) { 162501e04c3fSmrg chosen = n; 16267ec681f3Smrg chosen_register_pressure_benefit = register_pressure_benefit; 162701e04c3fSmrg continue; 162801e04c3fSmrg } else if (chosen_register_pressure_benefit > 0 && 162901e04c3fSmrg (register_pressure_benefit < 163001e04c3fSmrg chosen_register_pressure_benefit)) { 163101e04c3fSmrg continue; 163201e04c3fSmrg } 163301e04c3fSmrg 163401e04c3fSmrg if (mode == SCHEDULE_PRE_LIFO) { 163501e04c3fSmrg /* Prefer instructions that recently became available for 163601e04c3fSmrg * scheduling. These are the things that are most likely to 163701e04c3fSmrg * (eventually) make a variable dead and reduce register pressure. 163801e04c3fSmrg * Typical register pressure estimates don't work for us because 163901e04c3fSmrg * most of our pressure comes from texturing, where no single 164001e04c3fSmrg * instruction to schedule will make a vec4 value dead. 164101e04c3fSmrg */ 164201e04c3fSmrg if (n->cand_generation > chosen->cand_generation) { 164301e04c3fSmrg chosen = n; 16447ec681f3Smrg chosen_register_pressure_benefit = register_pressure_benefit; 164501e04c3fSmrg continue; 164601e04c3fSmrg } else if (n->cand_generation < chosen->cand_generation) { 164701e04c3fSmrg continue; 164801e04c3fSmrg } 164901e04c3fSmrg 165001e04c3fSmrg /* On MRF-using chips, prefer non-SEND instructions. If we don't 165101e04c3fSmrg * do this, then because we prefer instructions that just became 165201e04c3fSmrg * candidates, we'll end up in a pattern of scheduling a SEND, 165301e04c3fSmrg * then the MRFs for the next SEND, then the next SEND, then the 165401e04c3fSmrg * MRFs, etc., without ever consuming the results of a send. 
165501e04c3fSmrg */ 16567ec681f3Smrg if (v->devinfo->ver < 7) { 165701e04c3fSmrg fs_inst *chosen_inst = (fs_inst *)chosen->inst; 165801e04c3fSmrg 165901e04c3fSmrg /* We use size_written > 4 * exec_size as our test for the kind 166001e04c3fSmrg * of send instruction to avoid -- only sends generate many 166101e04c3fSmrg * regs, and a single-result send is probably actually reducing 166201e04c3fSmrg * register pressure. 166301e04c3fSmrg */ 166401e04c3fSmrg if (inst->size_written <= 4 * inst->exec_size && 166501e04c3fSmrg chosen_inst->size_written > 4 * chosen_inst->exec_size) { 166601e04c3fSmrg chosen = n; 16677ec681f3Smrg chosen_register_pressure_benefit = register_pressure_benefit; 166801e04c3fSmrg continue; 166901e04c3fSmrg } else if (inst->size_written > chosen_inst->size_written) { 167001e04c3fSmrg continue; 167101e04c3fSmrg } 167201e04c3fSmrg } 167301e04c3fSmrg } 167401e04c3fSmrg 167501e04c3fSmrg /* For instructions pushed on the cands list at the same time, prefer 167601e04c3fSmrg * the one with the highest delay to the end of the program. This is 167701e04c3fSmrg * most likely to have its values able to be consumed first (such as 167801e04c3fSmrg * for a large tree of lowered ubo loads, which appear reversed in 167901e04c3fSmrg * the instruction stream with respect to when they can be consumed). 168001e04c3fSmrg */ 168101e04c3fSmrg if (n->delay > chosen->delay) { 168201e04c3fSmrg chosen = n; 16837ec681f3Smrg chosen_register_pressure_benefit = register_pressure_benefit; 168401e04c3fSmrg continue; 168501e04c3fSmrg } else if (n->delay < chosen->delay) { 168601e04c3fSmrg continue; 168701e04c3fSmrg } 168801e04c3fSmrg 168901e04c3fSmrg /* Prefer the node most likely to unblock an early program exit. 
169001e04c3fSmrg */ 169101e04c3fSmrg if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) { 169201e04c3fSmrg chosen = n; 16937ec681f3Smrg chosen_register_pressure_benefit = register_pressure_benefit; 169401e04c3fSmrg continue; 169501e04c3fSmrg } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) { 169601e04c3fSmrg continue; 169701e04c3fSmrg } 169801e04c3fSmrg 169901e04c3fSmrg /* If all other metrics are equal, we prefer the first instruction in 170001e04c3fSmrg * the list (program execution). 170101e04c3fSmrg */ 170201e04c3fSmrg } 170301e04c3fSmrg } 170401e04c3fSmrg 170501e04c3fSmrg return chosen; 170601e04c3fSmrg} 170701e04c3fSmrg 170801e04c3fSmrgschedule_node * 170901e04c3fSmrgvec4_instruction_scheduler::choose_instruction_to_schedule() 171001e04c3fSmrg{ 171101e04c3fSmrg schedule_node *chosen = NULL; 171201e04c3fSmrg int chosen_time = 0; 171301e04c3fSmrg 171401e04c3fSmrg /* Of the instructions ready to execute or the closest to being ready, 171501e04c3fSmrg * choose the oldest one. 171601e04c3fSmrg */ 171701e04c3fSmrg foreach_in_list(schedule_node, n, &instructions) { 171801e04c3fSmrg if (!chosen || n->unblocked_time < chosen_time) { 171901e04c3fSmrg chosen = n; 172001e04c3fSmrg chosen_time = n->unblocked_time; 172101e04c3fSmrg } 172201e04c3fSmrg } 172301e04c3fSmrg 172401e04c3fSmrg return chosen; 172501e04c3fSmrg} 172601e04c3fSmrg 172701e04c3fSmrgint 17287ec681f3Smrgfs_instruction_scheduler::issue_time(backend_instruction *inst0) 172901e04c3fSmrg{ 17307ec681f3Smrg const fs_inst *inst = static_cast<fs_inst *>(inst0); 17317ec681f3Smrg const unsigned overhead = v->grf_used && has_bank_conflict(v->devinfo, inst) ? 
17327ec681f3Smrg DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE) : 0; 17337ec681f3Smrg if (is_compressed(inst)) 173401e04c3fSmrg return 4 + overhead; 173501e04c3fSmrg else 173601e04c3fSmrg return 2 + overhead; 173701e04c3fSmrg} 173801e04c3fSmrg 173901e04c3fSmrgint 174001e04c3fSmrgvec4_instruction_scheduler::issue_time(backend_instruction *) 174101e04c3fSmrg{ 174201e04c3fSmrg /* We always execute as two vec4s in parallel. */ 174301e04c3fSmrg return 2; 174401e04c3fSmrg} 174501e04c3fSmrg 174601e04c3fSmrgvoid 174701e04c3fSmrginstruction_scheduler::schedule_instructions(bblock_t *block) 174801e04c3fSmrg{ 17497ec681f3Smrg const struct intel_device_info *devinfo = bs->devinfo; 175001e04c3fSmrg int time = 0; 17517ec681f3Smrg int instructions_to_schedule = block->end_ip - block->start_ip + 1; 17527ec681f3Smrg 175301e04c3fSmrg if (!post_reg_alloc) 175401e04c3fSmrg reg_pressure = reg_pressure_in[block->num]; 175501e04c3fSmrg block_idx = block->num; 175601e04c3fSmrg 175701e04c3fSmrg /* Remove non-DAG heads from the list. */ 175801e04c3fSmrg foreach_in_list_safe(schedule_node, n, &instructions) { 175901e04c3fSmrg if (n->parent_count != 0) 176001e04c3fSmrg n->remove(); 176101e04c3fSmrg } 176201e04c3fSmrg 176301e04c3fSmrg unsigned cand_generation = 1; 176401e04c3fSmrg while (!instructions.is_empty()) { 176501e04c3fSmrg schedule_node *chosen = choose_instruction_to_schedule(); 176601e04c3fSmrg 176701e04c3fSmrg /* Schedule this instruction. 
*/ 176801e04c3fSmrg assert(chosen); 176901e04c3fSmrg chosen->remove(); 177001e04c3fSmrg chosen->inst->exec_node::remove(); 177101e04c3fSmrg block->instructions.push_tail(chosen->inst); 177201e04c3fSmrg instructions_to_schedule--; 177301e04c3fSmrg 177401e04c3fSmrg if (!post_reg_alloc) { 177501e04c3fSmrg reg_pressure -= get_register_pressure_benefit(chosen->inst); 177601e04c3fSmrg update_register_pressure(chosen->inst); 177701e04c3fSmrg } 177801e04c3fSmrg 177901e04c3fSmrg /* If we expected a delay for scheduling, then bump the clock to reflect 178001e04c3fSmrg * that. In reality, the hardware will switch to another hyperthread 178101e04c3fSmrg * and may not return to dispatching our thread for a while even after 178201e04c3fSmrg * we're unblocked. After this, we have the time when the chosen 178301e04c3fSmrg * instruction will start executing. 178401e04c3fSmrg */ 178501e04c3fSmrg time = MAX2(time, chosen->unblocked_time); 178601e04c3fSmrg 178701e04c3fSmrg /* Update the clock for how soon an instruction could start after the 178801e04c3fSmrg * chosen one. 178901e04c3fSmrg */ 179001e04c3fSmrg time += issue_time(chosen->inst); 179101e04c3fSmrg 179201e04c3fSmrg if (debug) { 179301e04c3fSmrg fprintf(stderr, "clock %4d, scheduled: ", time); 179401e04c3fSmrg bs->dump_instruction(chosen->inst); 179501e04c3fSmrg if (!post_reg_alloc) 179601e04c3fSmrg fprintf(stderr, "(register pressure %d)\n", reg_pressure); 179701e04c3fSmrg } 179801e04c3fSmrg 179901e04c3fSmrg /* Now that we've scheduled a new instruction, some of its 180001e04c3fSmrg * children can be promoted to the list of instructions ready to 180101e04c3fSmrg * be scheduled. Update the children's unblocked time for this 180201e04c3fSmrg * DAG edge as we do so. 
180301e04c3fSmrg */ 180401e04c3fSmrg for (int i = chosen->child_count - 1; i >= 0; i--) { 180501e04c3fSmrg schedule_node *child = chosen->children[i]; 180601e04c3fSmrg 180701e04c3fSmrg child->unblocked_time = MAX2(child->unblocked_time, 180801e04c3fSmrg time + chosen->child_latency[i]); 180901e04c3fSmrg 181001e04c3fSmrg if (debug) { 181101e04c3fSmrg fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count); 181201e04c3fSmrg bs->dump_instruction(child->inst); 181301e04c3fSmrg } 181401e04c3fSmrg 181501e04c3fSmrg child->cand_generation = cand_generation; 181601e04c3fSmrg child->parent_count--; 181701e04c3fSmrg if (child->parent_count == 0) { 181801e04c3fSmrg if (debug) { 181901e04c3fSmrg fprintf(stderr, "\t\tnow available\n"); 182001e04c3fSmrg } 182101e04c3fSmrg instructions.push_head(child); 182201e04c3fSmrg } 182301e04c3fSmrg } 182401e04c3fSmrg cand_generation++; 182501e04c3fSmrg 18267ec681f3Smrg /* Shared resource: the mathbox. There's one mathbox per EU on Gfx6+ 18277ec681f3Smrg * but it's more limited pre-gfx6, so if we send something off to it then 182801e04c3fSmrg * the next math instruction isn't going to make progress until the first 182901e04c3fSmrg * is done. 
183001e04c3fSmrg */ 18317ec681f3Smrg if (devinfo->ver < 6 && chosen->inst->is_math()) { 183201e04c3fSmrg foreach_in_list(schedule_node, n, &instructions) { 183301e04c3fSmrg if (n->inst->is_math()) 183401e04c3fSmrg n->unblocked_time = MAX2(n->unblocked_time, 183501e04c3fSmrg time + chosen->latency); 183601e04c3fSmrg } 183701e04c3fSmrg } 183801e04c3fSmrg } 183901e04c3fSmrg 184001e04c3fSmrg assert(instructions_to_schedule == 0); 184101e04c3fSmrg} 184201e04c3fSmrg 184301e04c3fSmrgvoid 184401e04c3fSmrginstruction_scheduler::run(cfg_t *cfg) 184501e04c3fSmrg{ 184601e04c3fSmrg if (debug && !post_reg_alloc) { 184701e04c3fSmrg fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n", 184801e04c3fSmrg post_reg_alloc); 184901e04c3fSmrg bs->dump_instructions(); 185001e04c3fSmrg } 185101e04c3fSmrg 185201e04c3fSmrg if (!post_reg_alloc) 185301e04c3fSmrg setup_liveness(cfg); 185401e04c3fSmrg 185501e04c3fSmrg foreach_block(block, cfg) { 185601e04c3fSmrg if (reads_remaining) { 185701e04c3fSmrg memset(reads_remaining, 0, 185801e04c3fSmrg grf_count * sizeof(*reads_remaining)); 185901e04c3fSmrg memset(hw_reads_remaining, 0, 186001e04c3fSmrg hw_reg_count * sizeof(*hw_reads_remaining)); 186101e04c3fSmrg memset(written, 0, grf_count * sizeof(*written)); 186201e04c3fSmrg 186301e04c3fSmrg foreach_inst_in_block(fs_inst, inst, block) 186401e04c3fSmrg count_reads_remaining(inst); 186501e04c3fSmrg } 186601e04c3fSmrg 186701e04c3fSmrg add_insts_from_block(block); 186801e04c3fSmrg 186901e04c3fSmrg calculate_deps(); 187001e04c3fSmrg 187101e04c3fSmrg compute_delays(); 187201e04c3fSmrg compute_exits(); 187301e04c3fSmrg 187401e04c3fSmrg schedule_instructions(block); 187501e04c3fSmrg } 187601e04c3fSmrg 187701e04c3fSmrg if (debug && !post_reg_alloc) { 187801e04c3fSmrg fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n", 187901e04c3fSmrg post_reg_alloc); 188001e04c3fSmrg bs->dump_instructions(); 188101e04c3fSmrg } 188201e04c3fSmrg} 188301e04c3fSmrg 188401e04c3fSmrgvoid 
188501e04c3fSmrgfs_visitor::schedule_instructions(instruction_scheduler_mode mode) 188601e04c3fSmrg{ 188701e04c3fSmrg int grf_count; 188801e04c3fSmrg if (mode == SCHEDULE_POST) 188901e04c3fSmrg grf_count = grf_used; 189001e04c3fSmrg else 189101e04c3fSmrg grf_count = alloc.count; 189201e04c3fSmrg 189301e04c3fSmrg fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf, 189401e04c3fSmrg cfg->num_blocks, mode); 189501e04c3fSmrg sched.run(cfg); 189601e04c3fSmrg 18977ec681f3Smrg invalidate_analysis(DEPENDENCY_INSTRUCTIONS); 189801e04c3fSmrg} 189901e04c3fSmrg 190001e04c3fSmrgvoid 190101e04c3fSmrgvec4_visitor::opt_schedule_instructions() 190201e04c3fSmrg{ 190301e04c3fSmrg vec4_instruction_scheduler sched(this, prog_data->total_grf); 190401e04c3fSmrg sched.run(cfg); 190501e04c3fSmrg 19067ec681f3Smrg invalidate_analysis(DEPENDENCY_INSTRUCTIONS); 190701e04c3fSmrg} 1908