1/*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_eu.h"
25#include "brw_fs.h"
26#include "brw_vec4.h"
27#include "brw_cfg.h"
28
29using namespace brw;
30
31namespace {
32   /**
33    * Enumeration representing the various asynchronous units that can run
34    * computations in parallel on behalf of a shader thread.
35    */
36   enum unit {
37      /** EU front-end. */
38      unit_fe,
39      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
40      unit_fpu,
41      /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
42      unit_em,
43      /** Sampler shared function. */
44      unit_sampler,
45      /** Pixel Interpolator shared function. */
46      unit_pi,
47      /** Unified Return Buffer shared function. */
48      unit_urb,
49      /** Data Port Data Cache shared function. */
50      unit_dp_dc,
51      /** Data Port Render Cache shared function. */
52      unit_dp_rc,
53      /** Data Port Constant Cache shared function. */
54      unit_dp_cc,
55      /** Message Gateway shared function. */
56      unit_gateway,
57      /** Thread Spawner shared function. */
58      unit_spawner,
59      /* unit_vme, */
60      /* unit_cre, */
61      /** Number of asynchronous units currently tracked. */
62      num_units,
63      /** Dummy unit for instructions that don't consume runtime from the above. */
64      unit_null = num_units
65   };
66
67   /**
68    * Enumeration representing a computation result another computation can
69    * potentially depend on.
70    */
71   enum dependency_id {
72      /* Register part of the GRF. */
73      dependency_id_grf0 = 0,
74      /* Register part of the MRF.  Only used on Gfx4-6. */
75      dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
76      /* Address register part of the ARF. */
77      dependency_id_addr0 = dependency_id_mrf0 + 24,
78      /* Accumulator register part of the ARF. */
79      dependency_id_accum0 = dependency_id_addr0 + 1,
80      /* Flag register part of the ARF. */
81      dependency_id_flag0 = dependency_id_accum0 + 12,
82      /* SBID token write completion.  Only used on Gfx12+. */
83      dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
84      /* SBID token read completion.  Only used on Gfx12+. */
85      dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
86      /* Number of computation dependencies currently tracked. */
87      num_dependency_ids = dependency_id_sbid_rd0 + 16
88   };
89
90   /**
91    * State of our modeling of the program execution.
92    */
93   struct state {
94      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95      /**
96       * Time at which a given unit will be ready to execute the next
97       * computation, in clock units.
98       */
99      unsigned unit_ready[num_units];
100      /**
101       * Time at which an instruction dependent on a given dependency ID will
102       * be ready to execute, in clock units.
103       */
104      unsigned dep_ready[num_dependency_ids];
105      /**
106       * Aggregated utilization of a given unit excluding idle cycles,
107       * in clock units.
108       */
109      float unit_busy[num_units];
110      /**
111       * Factor of the overhead of a computation accounted for in the
112       * aggregated utilization calculation.
113       */
114      float weight;
115   };
116
117   /**
118    * Information derived from an IR instruction used to compute performance
119    * estimates.  Allows the timing calculation to work on both FS and VEC4
120    * instructions.
121    */
122   struct instruction_info {
123      instruction_info(const intel_device_info *devinfo, const fs_inst *inst) :
124         devinfo(devinfo), op(inst->opcode),
125         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126         tx(get_exec_type(inst)), sx(0), ss(0),
127         sc(has_bank_conflict(devinfo, inst) ? sd : 0),
128         desc(inst->desc), sfid(inst->sfid)
129      {
130         /* We typically want the maximum source size, except for split send
131          * messages which require the total size.
132          */
133         if (inst->opcode == SHADER_OPCODE_SEND) {
134            ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135                 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136         } else {
137            for (unsigned i = 0; i < inst->sources; i++)
138               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139         }
140
141         /* Convert the execution size to GRF units. */
142         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143
144         /* 32x32 integer multiplication has half the usual ALU throughput.
145          * Treat it as double-precision.
146          */
147         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151      }
152
153      instruction_info(const intel_device_info *devinfo,
154                       const vec4_instruction *inst) :
155         devinfo(devinfo), op(inst->opcode),
156         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157         tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158         desc(inst->desc), sfid(inst->sfid)
159      {
160         /* Compute the maximum source size. */
161         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163
164         /* Convert the execution size to GRF units. */
165         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166
167         /* 32x32 integer multiplication has half the usual ALU throughput.
168          * Treat it as double-precision.
169          */
170         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174      }
175
176      /** Device information. */
177      const struct intel_device_info *devinfo;
178      /** Instruction opcode. */
179      opcode op;
180      /** Destination type. */
181      brw_reg_type td;
182      /** Destination size in GRF units. */
183      unsigned sd;
184      /** Execution type. */
185      brw_reg_type tx;
186      /** Execution size in GRF units. */
187      unsigned sx;
188      /** Source size. */
189      unsigned ss;
190      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
191      unsigned sc;
192      /** Send message descriptor. */
193      uint32_t desc;
194      /** Send message shared function ID. */
195      uint8_t sfid;
196   };
197
198   /**
199    * Timing information of an instruction used to estimate the performance of
200    * the program.
201    */
202   struct perf_desc {
203      perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
204         u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
205
206      /**
207       * Back-end unit its runtime shall be accounted to, in addition to the
208       * EU front-end which is always assumed to be involved.
209       */
210      unit u;
211      /**
212       * Overhead cycles from the time that the EU front-end starts executing
213       * the instruction until it's ready to execute the next instruction.
214       */
215      int df;
216      /**
217       * Overhead cycles from the time that the back-end starts executing the
218       * instruction until it's ready to execute the next instruction.
219       */
220      int db;
221      /**
222       * Latency cycles from the time that the back-end starts executing the
223       * instruction until its sources have been read from the register file.
224       */
225      int ls;
226      /**
227       * Latency cycles from the time that the back-end starts executing the
228       * instruction until its regular destination has been written to the
229       * register file.
230       */
231      int ld;
232      /**
233       * Latency cycles from the time that the back-end starts executing the
234       * instruction until its accumulator destination has been written to the
235       * ARF file.
236       *
237       * Note that this is an approximation of the real behavior of
238       * accumulating instructions in the hardware: Instead of modeling a pair
239       * of back-to-back accumulating instructions as a first computation with
240       * latency equal to ld followed by another computation with a
241       * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242       * model the stall as if it occurred at the top of the pipeline, with
243       * the latency of the accumulator computation offset accordingly.
244       */
245      int la;
246      /**
247       * Latency cycles from the time that the back-end starts executing the
248       * instruction until its flag destination has been written to the ARF
249       * file.
250       */
251      int lf;
252   };
253
254   /**
255    * Compute the timing information of an instruction based on any relevant
256    * information from the IR and a number of parameters specifying a linear
257    * approximation: Parameter X_Y specifies the derivative of timing X
258    * relative to info field Y, while X_1 specifies the independent term of
259    * the approximation of timing X.
260    */
261   perf_desc
262   calculate_desc(const instruction_info &info, unit u,
263                  int df_1, int df_sd, int df_sc,
264                  int db_1, int db_sx,
265                  int ls_1, int ld_1, int la_1, int lf_1,
266                  int l_ss, int l_sd)
267   {
268      return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269                          db_1 + db_sx * int(info.sx),
270                          ls_1 + l_ss * int(info.ss),
271                          ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272                          la_1, lf_1);
273   }
274
275   /**
276    * Compute the timing information of an instruction based on any relevant
277    * information from the IR and a number of linear approximation parameters
278    * hard-coded for each IR instruction.
279    *
280    * Most timing parameters are obtained from the multivariate linear
281    * regression of a sample of empirical timings measured using the tm0
282    * register (as can be done today by using the shader_time debugging
283    * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
284    * "Shared Functions - Extended Math", Section 3.2 "Performance".
285    * Parameters marked XXX shall be considered low-quality, they're possibly
286    * high variance or completely guessed in cases where experimental data was
287    * unavailable.
288    */
289   const perf_desc
290   instruction_desc(const instruction_info &info)
291   {
292      const struct intel_device_info *devinfo = info.devinfo;
293
294      switch (info.op) {
295      case BRW_OPCODE_SYNC:
296      case BRW_OPCODE_SEL:
297      case BRW_OPCODE_NOT:
298      case BRW_OPCODE_AND:
299      case BRW_OPCODE_OR:
300      case BRW_OPCODE_XOR:
301      case BRW_OPCODE_SHR:
302      case BRW_OPCODE_SHL:
303      case BRW_OPCODE_DIM:
304      case BRW_OPCODE_ASR:
305      case BRW_OPCODE_CMPN:
306      case BRW_OPCODE_F16TO32:
307      case BRW_OPCODE_BFREV:
308      case BRW_OPCODE_BFI1:
309      case BRW_OPCODE_AVG:
310      case BRW_OPCODE_FRC:
311      case BRW_OPCODE_RNDU:
312      case BRW_OPCODE_RNDD:
313      case BRW_OPCODE_RNDE:
314      case BRW_OPCODE_RNDZ:
315      case BRW_OPCODE_MAC:
316      case BRW_OPCODE_MACH:
317      case BRW_OPCODE_LZD:
318      case BRW_OPCODE_FBH:
319      case BRW_OPCODE_FBL:
320      case BRW_OPCODE_CBIT:
321      case BRW_OPCODE_ADDC:
322      case BRW_OPCODE_ROR:
323      case BRW_OPCODE_ROL:
324      case BRW_OPCODE_SUBB:
325      case BRW_OPCODE_SAD2:
326      case BRW_OPCODE_SADA2:
327      case BRW_OPCODE_LINE:
328      case BRW_OPCODE_NOP:
329      case SHADER_OPCODE_CLUSTER_BROADCAST:
330      case SHADER_OPCODE_SCRATCH_HEADER:
331      case FS_OPCODE_DDX_COARSE:
332      case FS_OPCODE_DDX_FINE:
333      case FS_OPCODE_DDY_COARSE:
334      case FS_OPCODE_PIXEL_X:
335      case FS_OPCODE_PIXEL_Y:
336      case FS_OPCODE_SET_SAMPLE_ID:
337      case VEC4_OPCODE_MOV_BYTES:
338      case VEC4_OPCODE_UNPACK_UNIFORM:
339      case VEC4_OPCODE_DOUBLE_TO_F32:
340      case VEC4_OPCODE_DOUBLE_TO_D32:
341      case VEC4_OPCODE_DOUBLE_TO_U32:
342      case VEC4_OPCODE_TO_DOUBLE:
343      case VEC4_OPCODE_PICK_LOW_32BIT:
344      case VEC4_OPCODE_PICK_HIGH_32BIT:
345      case VEC4_OPCODE_SET_LOW_32BIT:
346      case VEC4_OPCODE_SET_HIGH_32BIT:
347      case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
348      case GS_OPCODE_SET_DWORD_2:
349      case GS_OPCODE_SET_WRITE_OFFSET:
350      case GS_OPCODE_SET_VERTEX_COUNT:
351      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
352      case GS_OPCODE_SET_CHANNEL_MASKS:
353      case GS_OPCODE_GET_INSTANCE_ID:
354      case GS_OPCODE_SET_PRIMITIVE_ID:
355      case GS_OPCODE_SVB_SET_DST_INDEX:
356      case TCS_OPCODE_SRC0_010_IS_ZERO:
357      case TCS_OPCODE_GET_PRIMITIVE_ID:
358      case TES_OPCODE_GET_PRIMITIVE_ID:
359      case SHADER_OPCODE_GET_DSS_ID:
360         if (devinfo->ver >= 11) {
361            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
362                                  0, 10, 6 /* XXX */, 14, 0, 0);
363         } else if (devinfo->ver >= 8) {
364            if (type_sz(info.tx) > 4)
365               return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
366                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
367            else
368               return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
369                                     0, 8, 4, 12, 0, 0);
370         } else if (devinfo->is_haswell) {
371            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
372                                  0, 10, 6 /* XXX */, 16, 0, 0);
373         } else {
374            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
375                                  0, 12, 8 /* XXX */, 18, 0, 0);
376         }
377
378      case BRW_OPCODE_MOV:
379      case BRW_OPCODE_CMP:
380      case BRW_OPCODE_ADD:
381      case BRW_OPCODE_ADD3:
382      case BRW_OPCODE_MUL:
383      case SHADER_OPCODE_MOV_RELOC_IMM:
384      case VEC4_OPCODE_MOV_FOR_SCRATCH:
385         if (devinfo->ver >= 11) {
386            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
387                                  0, 10, 6, 14, 0, 0);
388         } else if (devinfo->ver >= 8) {
389            if (type_sz(info.tx) > 4)
390               return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
391                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
392            else
393               return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
394                                     0, 8, 4, 12, 0, 0);
395         } else if (devinfo->is_haswell) {
396            if (info.tx == BRW_REGISTER_TYPE_F)
397               return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
398                                     0, 12, 8 /* XXX */, 18, 0, 0);
399            else
400               return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
401                                     0, 10, 6 /* XXX */, 16, 0, 0);
402         } else if (devinfo->ver >= 7) {
403            if (info.tx == BRW_REGISTER_TYPE_F)
404               return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
405                                     0, 14, 10 /* XXX */, 20, 0, 0);
406            else
407               return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
408                                     0, 12, 8 /* XXX */, 18, 0, 0);
409         } else {
410            return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
411                                  0, 2 /* XXX */,
412                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
413                                  0, 0);
414         }
415
416      case BRW_OPCODE_BFE:
417      case BRW_OPCODE_BFI2:
418      case BRW_OPCODE_CSEL:
419         if (devinfo->ver >= 11)
420            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
421                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
422         else if (devinfo->ver >= 8)
423            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
424                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
425         else if (devinfo->is_haswell)
426            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
427                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
428         else if (devinfo->ver >= 7)
429            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
430                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
431         else
432            abort();
433
434      case BRW_OPCODE_MAD:
435         if (devinfo->ver >= 11) {
436            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
437                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
438         } else if (devinfo->ver >= 8) {
439            if (type_sz(info.tx) > 4)
440               return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
441                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
442            else
443               return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
444                                     0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
445         } else if (devinfo->is_haswell) {
446            if (info.tx == BRW_REGISTER_TYPE_F)
447               return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
448                                     0, 12, 8 /* XXX */, 18, 0, 0);
449            else
450               return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
451                                     0, 10, 6 /* XXX */, 16, 0, 0);
452         } else if (devinfo->ver >= 7) {
453            if (info.tx == BRW_REGISTER_TYPE_F)
454               return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
455                                     0, 14, 10 /* XXX */, 20, 0, 0);
456            else
457               return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
458                                     0, 12, 8 /* XXX */, 18, 0, 0);
459         } else if (devinfo->ver >= 6) {
460            return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
461                                  0, 2 /* XXX */,
462                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
463                                  0, 0);
464         } else {
465            abort();
466         }
467
468      case BRW_OPCODE_F32TO16:
469         if (devinfo->ver >= 11)
470            return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
471                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
472         else if (devinfo->ver >= 8)
473            return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
474                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
475         else if (devinfo->is_haswell)
476            return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
477                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
478         else if (devinfo->ver >= 7)
479            return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
480                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
481         else
482            abort();
483
484      case BRW_OPCODE_DP4:
485      case BRW_OPCODE_DPH:
486      case BRW_OPCODE_DP3:
487      case BRW_OPCODE_DP2:
488         if (devinfo->ver >= 8)
489            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
490                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
491         else if (devinfo->is_haswell)
492            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
493                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
494         else
495            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
496                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
497
498      case BRW_OPCODE_DP4A:
499         if (devinfo->ver >= 12)
500            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
501                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
502         else
503            abort();
504
505      case SHADER_OPCODE_RCP:
506      case SHADER_OPCODE_RSQ:
507      case SHADER_OPCODE_SQRT:
508      case SHADER_OPCODE_EXP2:
509      case SHADER_OPCODE_LOG2:
510      case SHADER_OPCODE_SIN:
511      case SHADER_OPCODE_COS:
512      case SHADER_OPCODE_POW:
513      case SHADER_OPCODE_INT_QUOTIENT:
514      case SHADER_OPCODE_INT_REMAINDER:
515         if (devinfo->ver >= 6) {
516            switch (info.op) {
517            case SHADER_OPCODE_RCP:
518            case SHADER_OPCODE_RSQ:
519            case SHADER_OPCODE_SQRT:
520            case SHADER_OPCODE_EXP2:
521            case SHADER_OPCODE_LOG2:
522            case SHADER_OPCODE_SIN:
523            case SHADER_OPCODE_COS:
524               if (devinfo->ver >= 8)
525                  return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
526                                        0, 16, 0, 0, 0, 0);
527               else if (devinfo->is_haswell)
528                  return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
529                                        0, 12, 0, 0, 0, 0);
530               else
531                  return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
532                                        0, 14, 0, 0, 0, 0);
533
534            case SHADER_OPCODE_POW:
535               if (devinfo->ver >= 8)
536                  return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
537                                        0, 24, 0, 0, 0, 0);
538               else if (devinfo->is_haswell)
539                  return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
540                                        0, 20, 0, 0, 0, 0);
541               else
542                  return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
543                                        0, 22, 0, 0, 0, 0);
544
545            case SHADER_OPCODE_INT_QUOTIENT:
546            case SHADER_OPCODE_INT_REMAINDER:
547               return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
548                                     0, 28 /* XXX */, 0, 0, 0, 0);
549
550            default:
551               abort();
552            }
553         } else {
554            switch (info.op) {
555            case SHADER_OPCODE_RCP:
556               return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
557                                     0, 22, 0, 0, 0, 8);
558
559            case SHADER_OPCODE_RSQ:
560               return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
561                                     0, 44, 0, 0, 0, 8);
562
563            case SHADER_OPCODE_INT_QUOTIENT:
564            case SHADER_OPCODE_SQRT:
565            case SHADER_OPCODE_LOG2:
566               return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
567                                     0, 66, 0, 0, 0, 8);
568
569            case SHADER_OPCODE_INT_REMAINDER:
570            case SHADER_OPCODE_EXP2:
571               return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
572                                     0, 88, 0, 0, 0, 8);
573
574            case SHADER_OPCODE_SIN:
575            case SHADER_OPCODE_COS:
576               return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
577                                     0, 132, 0, 0, 0, 8);
578
579            case SHADER_OPCODE_POW:
580               return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
581                                     0, 176, 0, 0, 0, 8);
582
583            default:
584               abort();
585            }
586         }
587
588      case BRW_OPCODE_DO:
589         if (devinfo->ver >= 6)
590            return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
591                                  0, 0, 0, 0, 0, 0);
592         else
593            return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
594                                  0, 0, 0, 0, 0, 0);
595
596      case BRW_OPCODE_IF:
597      case BRW_OPCODE_ELSE:
598      case BRW_OPCODE_ENDIF:
599      case BRW_OPCODE_WHILE:
600      case BRW_OPCODE_BREAK:
601      case BRW_OPCODE_CONTINUE:
602      case BRW_OPCODE_HALT:
603         if (devinfo->ver >= 8)
604            return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
605                                  0, 0, 0, 0, 0, 0);
606         else if (devinfo->is_haswell)
607            return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
608                                  0, 0, 0, 0, 0, 0);
609         else
610            return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
611                                  0, 0, 0, 0, 0, 0);
612
613      case FS_OPCODE_LINTERP:
614         if (devinfo->ver >= 8)
615            return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
616                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
617         else if (devinfo->is_haswell)
618            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
619                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
620         else
621            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
622                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
623
624      case BRW_OPCODE_LRP:
625         if (devinfo->ver >= 8)
626            return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
627                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
628         else if (devinfo->is_haswell)
629            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
630                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
631         else if (devinfo->ver >= 6)
632            return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
633                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
634         else
635            abort();
636
637      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
638         if (devinfo->ver >= 11)
639            return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
640                                  0, 10 /* XXX */, 6 /* XXX */,
641                                  14 /* XXX */, 0, 0);
642         else if (devinfo->ver >= 8)
643            return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
644                                  0, 8 /* XXX */, 4 /* XXX */,
645                                  12 /* XXX */, 0, 0);
646         else if (devinfo->is_haswell)
647            return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
648                                  0, 10 /* XXX */, 6 /* XXX */,
649                                  16 /* XXX */, 0, 0);
650         else if (devinfo->ver >= 7)
651            return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
652                                  0, 12 /* XXX */, 8 /* XXX */,
653                                  18 /* XXX */, 0, 0);
654         else
655            abort();
656
657      case SHADER_OPCODE_MOV_INDIRECT:
658         if (devinfo->ver >= 11)
659            return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
660                                  0, 10 /* XXX */, 6 /* XXX */,
661                                  14 /* XXX */, 0, 0);
662         else if (devinfo->ver >= 8)
663            return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
664                                  0, 8 /* XXX */, 4 /* XXX */,
665                                  12 /* XXX */, 0, 0);
666         else if (devinfo->is_haswell)
667            return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
668                                  0, 10 /* XXX */, 6 /* XXX */,
669                                  16 /* XXX */, 0, 0);
670         else
671            return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
672                                  0, 12 /* XXX */, 8 /* XXX */,
673                                  18 /* XXX */, 0, 0);
674
675      case SHADER_OPCODE_BROADCAST:
676         if (devinfo->ver >= 11)
677            return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
678                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
679         else if (devinfo->ver >= 8)
680            return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
681                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
682         else if (devinfo->is_haswell)
683            return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
684                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
685         else if (devinfo->ver >= 7)
686            return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
687                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
688         else
689            abort();
690
691      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
692         if (devinfo->ver >= 11)
693            return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
694                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
695         else if (devinfo->ver >= 8)
696            return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
697                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
698         else if (devinfo->is_haswell)
699            return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
700                                  0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
701         else if (devinfo->ver >= 7)
702            return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
703                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
704         else
705            abort();
706
707      case SHADER_OPCODE_RND_MODE:
708      case SHADER_OPCODE_FLOAT_CONTROL_MODE:
709         if (devinfo->ver >= 11)
710            return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
711                                  4 /* XXX */, 0,
712                                  0, 0, 0, 0, 0, 0);
713         else if (devinfo->ver >= 8)
714            return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
715                                  4 /* XXX */, 0,
716                                  0, 0, 0, 0, 0, 0);
717         else if (devinfo->is_haswell)
718            return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
719                                  4 /* XXX */, 0,
720                                  0, 0, 0, 0, 0, 0);
721         else if (devinfo->ver >= 6)
722            return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
723                                  4 /* XXX */, 0,
724                                  0, 0, 0, 0, 0, 0);
725         else
726            abort();
727
728      case SHADER_OPCODE_SHUFFLE:
729         if (devinfo->ver >= 11)
730            return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
731                                  44 /* XXX */, 0,
732                                  0, 10 /* XXX */, 6 /* XXX */,
733                                  14 /* XXX */, 0, 0);
734         else if (devinfo->ver >= 8)
735            return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
736                                  42 /* XXX */, 0,
737                                  0, 8 /* XXX */, 4 /* XXX */,
738                                  12 /* XXX */, 0, 0);
739         else if (devinfo->is_haswell)
740            return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
741                                  0, 44 /* XXX */,
742                                  0, 10 /* XXX */, 6 /* XXX */,
743                                  16 /* XXX */, 0, 0);
744         else if (devinfo->ver >= 6)
745            return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
746                                  0, 46 /* XXX */,
747                                  0, 12 /* XXX */, 8 /* XXX */,
748                                  18 /* XXX */, 0, 0);
749         else
750            abort();
751
752      case SHADER_OPCODE_SEL_EXEC:
753         if (devinfo->ver >= 11)
754            return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
755                                  0, 4 /* XXX */,
756                                  0, 10 /* XXX */, 6 /* XXX */,
757                                  14 /* XXX */, 0, 0);
758         else if (devinfo->ver >= 8)
759            return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
760                                  0, 4 /* XXX */,
761                                  0, 8 /* XXX */, 4 /* XXX */,
762                                  12 /* XXX */, 0, 0);
763         else if (devinfo->is_haswell)
764            return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
765                                  0, 4 /* XXX */,
766                                  0, 10 /* XXX */, 6 /* XXX */,
767                                  16 /* XXX */, 0, 0);
768         else
769            return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
770                                  0, 4 /* XXX */,
771                                  0, 12 /* XXX */, 8 /* XXX */,
772                                  18 /* XXX */, 0, 0);
773
774      case SHADER_OPCODE_QUAD_SWIZZLE:
775         if (devinfo->ver >= 11)
776            return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
777                                  0, 8 /* XXX */,
778                                  0, 10 /* XXX */, 6 /* XXX */,
779                                  14 /* XXX */, 0, 0);
780         else if (devinfo->ver >= 8)
781            return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
782                                  0, 8 /* XXX */,
783                                  0, 8 /* XXX */, 4 /* XXX */,
784                                  12 /* XXX */, 0, 0);
785         else if (devinfo->is_haswell)
786            return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
787                                  0, 8 /* XXX */,
788                                  0, 10 /* XXX */, 6 /* XXX */,
789                                  16 /* XXX */, 0, 0);
790         else
791            return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
792                                  0, 8 /* XXX */,
793                                  0, 12 /* XXX */, 8 /* XXX */,
794                                  18 /* XXX */, 0, 0);
795
796      case FS_OPCODE_DDY_FINE:
797         if (devinfo->ver >= 11)
798            return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
799                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
800         else if (devinfo->ver >= 8)
801            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
802                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
803         else if (devinfo->is_haswell)
804            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
805                                  0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
806         else
807            return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
808                                  0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
809
810      case FS_OPCODE_LOAD_LIVE_CHANNELS:
811         if (devinfo->ver >= 11)
812            return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
813                                  2 /* XXX */, 0,
814                                  0, 0, 0, 10 /* XXX */, 0, 0);
815         else if (devinfo->ver >= 8)
816            return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
817                                  0, 2 /* XXX */,
818                                  0, 0, 0, 8 /* XXX */, 0, 0);
819         else
820            abort();
821
822      case VEC4_OPCODE_PACK_BYTES:
823         if (devinfo->ver >= 8)
824            return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
825                                  4 /* XXX */, 0,
826                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
827                                  0, 0);
828         else if (devinfo->is_haswell)
829            return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
830                                  4 /* XXX */, 0,
831                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
832                                  0, 0);
833         else
834            return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
835                                  4 /* XXX */, 0,
836                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
837                                  0, 0);
838
839      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
840      case TCS_OPCODE_GET_INSTANCE_ID:
841      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
842      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
843      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
844         if (devinfo->ver >= 8)
845            return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
846                                  6 /* XXX */, 0,
847                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
848                                  0, 0);
849         else if (devinfo->is_haswell)
850            return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
851                                  6 /* XXX */, 0,
852                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
853                                  0, 0);
854         else
855            return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
856                                  6 /* XXX */, 0,
857                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
858                                  0, 0);
859
860      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
861      case TCS_OPCODE_CREATE_BARRIER_HEADER:
862         if (devinfo->ver >= 8)
863            return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
864                                  8 /* XXX */, 0,
865                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
866                                  0, 0);
867         else if (devinfo->is_haswell)
868            return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
869                                  8 /* XXX */, 0,
870                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
871                                  0, 0);
872         else if (devinfo->ver >= 6)
873            return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
874                                  8 /* XXX */, 0,
875                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
876                                  0, 0);
877         else
878            abort();
879
880      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
881         if (devinfo->ver >= 8)
882            return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
883                                  4 /* XXX */, 0,
884                                  0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
885                                  0, 0);
886         else if (devinfo->is_haswell)
887            return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
888                                  4 /* XXX */, 0,
889                                  0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
890                                  0, 0);
891         else if (devinfo->ver >= 7)
892            return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
893                                  4 /* XXX */, 0,
894                                  0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
895                                  0, 0);
896         else
897            abort();
898
899      case SHADER_OPCODE_TEX:
900      case FS_OPCODE_TXB:
901      case SHADER_OPCODE_TXD:
902      case SHADER_OPCODE_TXF:
903      case SHADER_OPCODE_TXF_LZ:
904      case SHADER_OPCODE_TXL:
905      case SHADER_OPCODE_TXL_LZ:
906      case SHADER_OPCODE_TXF_CMS:
907      case SHADER_OPCODE_TXF_CMS_W:
908      case SHADER_OPCODE_TXF_UMS:
909      case SHADER_OPCODE_TXF_MCS:
910      case SHADER_OPCODE_TXS:
911      case SHADER_OPCODE_LOD:
912      case SHADER_OPCODE_GET_BUFFER_SIZE:
913      case SHADER_OPCODE_TG4:
914      case SHADER_OPCODE_TG4_OFFSET:
915      case SHADER_OPCODE_SAMPLEINFO:
916      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
917         return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
918                               8 /* XXX */, 750 /* XXX */, 0, 0,
919                               2 /* XXX */, 0);
920
921      case SHADER_OPCODE_URB_READ_SIMD8:
922      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
923      case SHADER_OPCODE_URB_WRITE_SIMD8:
924      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
925      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
926      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
927      case VEC4_OPCODE_URB_READ:
928      case VS_OPCODE_URB_WRITE:
929      case GS_OPCODE_URB_WRITE:
930      case GS_OPCODE_URB_WRITE_ALLOCATE:
931      case GS_OPCODE_THREAD_END:
932      case GS_OPCODE_FF_SYNC:
933      case TCS_OPCODE_URB_WRITE:
934      case TCS_OPCODE_RELEASE_INPUT:
935      case TCS_OPCODE_THREAD_END:
936         return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
937                               32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
938
939      case SHADER_OPCODE_MEMORY_FENCE:
940      case SHADER_OPCODE_INTERLOCK:
941         switch (info.sfid) {
942         case GFX6_SFID_DATAPORT_RENDER_CACHE:
943            if (devinfo->ver >= 7)
944               return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
945                                     10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
946            else
947               abort();
948
949         case BRW_SFID_URB:
950         case GFX7_SFID_DATAPORT_DATA_CACHE:
951         case GFX12_SFID_SLM:
952         case GFX12_SFID_TGM:
953         case GFX12_SFID_UGM:
954         case HSW_SFID_DATAPORT_DATA_CACHE_1:
955            if (devinfo->ver >= 7)
956               return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
957                                     10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
958            else
959               abort();
960
961         default:
962            abort();
963         }
964
965      case SHADER_OPCODE_GFX4_SCRATCH_READ:
966      case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
967      case SHADER_OPCODE_GFX7_SCRATCH_READ:
968         return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
969                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
970
971      case VEC4_OPCODE_UNTYPED_ATOMIC:
972         if (devinfo->ver >= 7)
973            return calculate_desc(info, unit_dp_dc, 2, 0, 0,
974                                  30 /* XXX */, 400 /* XXX */,
975                                  10 /* XXX */, 100 /* XXX */, 0, 0,
976                                  0, 400 /* XXX */);
977         else
978            abort();
979
980      case VEC4_OPCODE_UNTYPED_SURFACE_READ:
981      case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
982         if (devinfo->ver >= 7)
983            return calculate_desc(info, unit_dp_dc, 2, 0, 0,
984                                  0, 20 /* XXX */,
985                                  10 /* XXX */, 100 /* XXX */, 0, 0,
986                                  0, 0);
987         else
988            abort();
989
990      case FS_OPCODE_FB_WRITE:
991      case FS_OPCODE_FB_READ:
992      case FS_OPCODE_REP_FB_WRITE:
993         return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
994                               10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
995
996      case GS_OPCODE_SVB_WRITE:
997         if (devinfo->ver >= 6)
998            return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
999                                  0, 450 /* XXX */,
1000                                  10 /* XXX */, 300 /* XXX */, 0, 0,
1001                                  0, 0);
1002         else
1003            abort();
1004
1005      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1006      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
1007         return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
1008                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1009
1010      case VS_OPCODE_PULL_CONSTANT_LOAD:
1011      case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1012         return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1013                               8, 750, 0, 0, 2, 0);
1014
1015      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1016      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1017      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1018         if (devinfo->ver >= 7)
1019            return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
1020                                  0, 90 /* XXX */, 0, 0, 0, 0);
1021         else
1022            abort();
1023
1024      case SHADER_OPCODE_BARRIER:
1025         if (devinfo->ver >= 7)
1026            return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1027                                  0 /* XXX */, 0,
1028                                  0, 0, 0, 0, 0, 0);
1029         else
1030            abort();
1031
1032      case CS_OPCODE_CS_TERMINATE:
1033         if (devinfo->ver >= 7)
1034            return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1035                                  10 /* XXX */, 0, 0, 0, 0, 0);
1036         else
1037            abort();
1038
1039      case SHADER_OPCODE_SEND:
1040         switch (info.sfid) {
1041         case GFX6_SFID_DATAPORT_RENDER_CACHE:
1042            if (devinfo->ver >= 7) {
1043               switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1044               case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1045                  return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1046                                        30 /* XXX */, 450 /* XXX */,
1047                                        10 /* XXX */, 100 /* XXX */,
1048                                        0, 0, 0, 400 /* XXX */);
1049               default:
1050                  return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1051                                        0, 450 /* XXX */,
1052                                        10 /* XXX */, 300 /* XXX */, 0, 0,
1053                                        0, 0);
1054               }
1055            } else if (devinfo->ver >= 6)  {
1056               return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1057                                     0, 450 /* XXX */,
1058                                     10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1059            } else {
1060               abort();
1061            }
1062         case BRW_SFID_SAMPLER: {
1063            if (devinfo->ver >= 6)
1064               return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1065                                     8, 750, 0, 0, 2, 0);
1066            else
1067               abort();
1068         }
1069         case GFX7_SFID_DATAPORT_DATA_CACHE:
1070         case HSW_SFID_DATAPORT_DATA_CACHE_1:
1071            if (devinfo->verx10 >= 75) {
1072               switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1073               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1074               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1075               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1076               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1077                  return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1078                                        30 /* XXX */, 400 /* XXX */,
1079                                        10 /* XXX */, 100 /* XXX */, 0, 0,
1080                                        0, 400 /* XXX */);
1081
1082               default:
1083                  return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1084                                        0, 20 /* XXX */,
1085                                        10 /* XXX */, 100 /* XXX */, 0, 0,
1086                                        0, 0);
1087               }
1088            } else if (devinfo->ver >= 7) {
1089               switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1090               case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1091                  return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1092                                        30 /* XXX */, 400 /* XXX */,
1093                                        10 /* XXX */, 100 /* XXX */,
1094                                        0, 0, 0, 400 /* XXX */);
1095               default:
1096                  return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1097                                        0, 20 /* XXX */,
1098                                        10 /* XXX */, 100 /* XXX */, 0, 0,
1099                                        0, 0);
1100               }
1101            } else {
1102               abort();
1103            }
1104
1105         case GFX12_SFID_UGM:
1106         case GFX12_SFID_TGM:
1107         case GFX12_SFID_SLM:
1108            switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1109            case LSC_OP_LOAD:
1110            case LSC_OP_STORE:
1111            case LSC_OP_LOAD_CMASK:
1112            case LSC_OP_STORE_CMASK:
1113               return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1114                                     0, 20 /* XXX */,
1115                                     10 /* XXX */, 100 /* XXX */, 0, 0,
1116                                     0, 0);
1117
1118            case LSC_OP_FENCE:
1119            case LSC_OP_ATOMIC_INC:
1120            case LSC_OP_ATOMIC_DEC:
1121            case LSC_OP_ATOMIC_LOAD:
1122            case LSC_OP_ATOMIC_STORE:
1123            case LSC_OP_ATOMIC_ADD:
1124            case LSC_OP_ATOMIC_SUB:
1125            case LSC_OP_ATOMIC_MIN:
1126            case LSC_OP_ATOMIC_MAX:
1127            case LSC_OP_ATOMIC_UMIN:
1128            case LSC_OP_ATOMIC_UMAX:
1129            case LSC_OP_ATOMIC_CMPXCHG:
1130            case LSC_OP_ATOMIC_FADD:
1131            case LSC_OP_ATOMIC_FSUB:
1132            case LSC_OP_ATOMIC_FMIN:
1133            case LSC_OP_ATOMIC_FMAX:
1134            case LSC_OP_ATOMIC_FCMPXCHG:
1135            case LSC_OP_ATOMIC_AND:
1136            case LSC_OP_ATOMIC_OR:
1137            case LSC_OP_ATOMIC_XOR:
1138               return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1139                                     30 /* XXX */, 400 /* XXX */,
1140                                     10 /* XXX */, 100 /* XXX */, 0, 0,
1141                                     0, 400 /* XXX */);
1142            default:
1143               abort();
1144            }
1145
1146         case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
1147         case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
1148            return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1149                                  10 /* XXX */, 0, 0, 0, 0, 0);
1150
1151         default:
1152            abort();
1153         }
1154
1155      case SHADER_OPCODE_UNDEF:
1156      case SHADER_OPCODE_HALT_TARGET:
1157      case FS_OPCODE_SCHEDULING_FENCE:
1158         return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1159                               0, 0, 0, 0, 0, 0);
1160
1161      default:
1162         abort();
1163      }
1164   }
1165
1166   /**
1167    * Model the performance behavior of a stall on the specified dependency
1168    * ID.
1169    */
1170   void
1171   stall_on_dependency(state &st, dependency_id id)
1172   {
1173      if (id < ARRAY_SIZE(st.dep_ready))
1174         st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1175                                       st.dep_ready[id]);
1176   }
1177
1178   /**
1179    * Model the performance behavior of the front-end and back-end while
1180    * executing an instruction with the specified timing information, assuming
1181    * all dependencies are already clear.
1182    */
1183   void
1184   execute_instruction(state &st, const perf_desc &perf)
1185   {
1186      /* Compute the time at which the front-end will be ready to execute the
1187       * next instruction.
1188       */
1189      st.unit_ready[unit_fe] += perf.df;
1190
1191      if (perf.u < num_units) {
1192         /* Wait for the back-end to be ready to execute this instruction. */
1193         st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1194                                       st.unit_ready[perf.u]);
1195
1196         /* Compute the time at which the back-end will be ready to execute
1197          * the next instruction, and update the back-end utilization.
1198          */
1199         st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1200         st.unit_busy[perf.u] += perf.db * st.weight;
1201      }
1202   }
1203
1204   /**
1205    * Model the performance behavior of a read dependency provided by an
1206    * instruction.
1207    */
1208   void
1209   mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1210   {
1211      if (id < ARRAY_SIZE(st.dep_ready))
1212         st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1213   }
1214
1215   /**
1216    * Model the performance behavior of a write dependency provided by an
1217    * instruction.
1218    */
1219   void
1220   mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1221   {
1222      if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1223         st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1224      else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1225         st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1226      else if (id < ARRAY_SIZE(st.dep_ready))
1227         st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1228   }
1229
1230   /**
1231    * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1232    */
1233   dependency_id
1234   reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
1235                     const int delta)
1236   {
1237      if (r.file == VGRF) {
1238         const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1239         assert(i < dependency_id_mrf0 - dependency_id_grf0);
1240         return dependency_id(dependency_id_grf0 + i);
1241
1242      } else if (r.file == FIXED_GRF) {
1243         const unsigned i = r.nr + delta;
1244         assert(i < dependency_id_mrf0 - dependency_id_grf0);
1245         return dependency_id(dependency_id_grf0 + i);
1246
1247      } else if (r.file == MRF && devinfo->ver >= 7) {
1248         const unsigned i = GFX7_MRF_HACK_START +
1249                            r.nr + r.offset / REG_SIZE + delta;
1250         assert(i < dependency_id_mrf0 - dependency_id_grf0);
1251         return dependency_id(dependency_id_grf0 + i);
1252
1253      } else if (r.file == MRF && devinfo->ver < 7) {
1254         const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1255                            r.offset / REG_SIZE + delta;
1256         assert(i < dependency_id_addr0 - dependency_id_mrf0);
1257         return dependency_id(dependency_id_mrf0 + i);
1258
1259      } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1260                 r.nr < BRW_ARF_ACCUMULATOR) {
1261         assert(delta == 0);
1262         return dependency_id_addr0;
1263
1264      } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1265                 r.nr < BRW_ARF_FLAG) {
1266         const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1267         assert(i < dependency_id_flag0 - dependency_id_accum0);
1268         return dependency_id(dependency_id_accum0 + i);
1269
1270      } else {
1271         return num_dependency_ids;
1272      }
1273   }
1274
1275   /**
1276    * Return the dependency ID of flag register starting at offset \p i.
1277    */
1278   dependency_id
1279   flag_dependency_id(unsigned i)
1280   {
1281      assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
1282      return dependency_id(dependency_id_flag0 + i);
1283   }
1284
1285   /**
1286    * Return the dependency ID corresponding to the SBID read completion
1287    * condition of a Gfx12+ SWSB.
1288    */
1289   dependency_id
1290   tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1291   {
1292      if (swsb.mode) {
1293         assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1294         return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1295      } else {
1296         return num_dependency_ids;
1297      }
1298   }
1299
1300   /**
1301    * Return the dependency ID corresponding to the SBID write completion
1302    * condition of a Gfx12+ SWSB.
1303    */
1304   dependency_id
1305   tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1306   {
1307      if (swsb.mode) {
1308         assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1309         return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1310      } else {
1311         return num_dependency_ids;
1312      }
1313   }
1314
1315   /**
1316    * Return the implicit accumulator register accessed by channel \p i of the
1317    * instruction.
1318    */
1319   unsigned
1320   accum_reg_of_channel(const intel_device_info *devinfo,
1321                        const backend_instruction *inst,
1322                        brw_reg_type tx, unsigned i)
1323   {
1324      assert(inst->reads_accumulator_implicitly() ||
1325             inst->writes_accumulator_implicitly(devinfo));
1326      const unsigned offset = (inst->group + i) * type_sz(tx) *
1327         (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1328      return offset / REG_SIZE % 2;
1329   }
1330
1331   /**
1332    * Model the performance behavior of an FS back-end instruction.
1333    */
1334   void
1335   issue_fs_inst(state &st, const intel_device_info *devinfo,
1336                 const backend_instruction *be_inst)
1337   {
1338      const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
1339      const instruction_info info(devinfo, inst);
1340      const perf_desc perf = instruction_desc(info);
1341
1342      /* Stall on any source dependencies. */
1343      for (unsigned i = 0; i < inst->sources; i++) {
1344         for (unsigned j = 0; j < regs_read(inst, i); j++)
1345            stall_on_dependency(
1346               st, reg_dependency_id(devinfo, inst->src[i], j));
1347      }
1348
1349      if (inst->reads_accumulator_implicitly()) {
1350         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1351              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1352                                        inst->exec_size - 1); j++)
1353            stall_on_dependency(
1354               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1355      }
1356
1357      if (is_send(inst) && inst->base_mrf != -1) {
1358         for (unsigned j = 0; j < inst->mlen; j++)
1359            stall_on_dependency(
1360               st, reg_dependency_id(
1361                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1362      }
1363
1364      if (const unsigned mask = inst->flags_read(devinfo)) {
1365         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1366            if (mask & (1 << i))
1367               stall_on_dependency(st, flag_dependency_id(i));
1368         }
1369      }
1370
1371      /* Stall on any write dependencies. */
1372      if (!inst->no_dd_check) {
1373         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1374            for (unsigned j = 0; j < regs_written(inst); j++)
1375               stall_on_dependency(
1376                  st, reg_dependency_id(devinfo, inst->dst, j));
1377         }
1378
1379         if (inst->writes_accumulator_implicitly(devinfo)) {
1380            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1381                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1382                                           inst->exec_size - 1); j++)
1383               stall_on_dependency(
1384                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1385         }
1386
1387         if (const unsigned mask = inst->flags_written(devinfo)) {
1388            for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1389               if (mask & (1 << i))
1390                  stall_on_dependency(st, flag_dependency_id(i));
1391            }
1392         }
1393      }
1394
1395      /* Stall on any SBID dependencies. */
1396      if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1397         stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1398      else if (inst->sched.mode & TGL_SBID_SRC)
1399         stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1400
1401      /* Execute the instruction. */
1402      execute_instruction(st, perf);
1403
1404      /* Mark any source dependencies. */
1405      if (inst->is_send_from_grf()) {
1406         for (unsigned i = 0; i < inst->sources; i++) {
1407            if (inst->is_payload(i)) {
1408               for (unsigned j = 0; j < regs_read(inst, i); j++)
1409                  mark_read_dependency(
1410                     st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1411            }
1412         }
1413      }
1414
1415      if (is_send(inst) && inst->base_mrf != -1) {
1416         for (unsigned j = 0; j < inst->mlen; j++)
1417            mark_read_dependency(st, perf,
1418               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1419      }
1420
1421      /* Mark any destination dependencies. */
1422      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1423         for (unsigned j = 0; j < regs_written(inst); j++) {
1424            mark_write_dependency(st, perf,
1425                                  reg_dependency_id(devinfo, inst->dst, j));
1426         }
1427      }
1428
1429      if (inst->writes_accumulator_implicitly(devinfo)) {
1430         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1431              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1432                                        inst->exec_size - 1); j++)
1433            mark_write_dependency(st, perf,
1434                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
1435      }
1436
1437      if (const unsigned mask = inst->flags_written(devinfo)) {
1438         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1439            if (mask & (1 << i))
1440               mark_write_dependency(st, perf, flag_dependency_id(i));
1441         }
1442      }
1443
1444      /* Mark any SBID dependencies. */
1445      if (inst->sched.mode & TGL_SBID_SET) {
1446         mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1447         mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1448      }
1449   }
1450
1451   /**
1452    * Model the performance behavior of a VEC4 back-end instruction.
1453    */
1454   void
1455   issue_vec4_instruction(state &st, const intel_device_info *devinfo,
1456                          const backend_instruction *be_inst)
1457   {
1458      const vec4_instruction *inst =
1459         static_cast<const vec4_instruction *>(be_inst);
1460      const instruction_info info(devinfo, inst);
1461      const perf_desc perf = instruction_desc(info);
1462
1463      /* Stall on any source dependencies. */
1464      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1465         for (unsigned j = 0; j < regs_read(inst, i); j++)
1466            stall_on_dependency(
1467               st, reg_dependency_id(devinfo, inst->src[i], j));
1468      }
1469
1470      if (inst->reads_accumulator_implicitly()) {
1471         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1472              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1473                                        inst->exec_size - 1); j++)
1474            stall_on_dependency(
1475               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1476      }
1477
1478      if (inst->base_mrf != -1) {
1479         for (unsigned j = 0; j < inst->mlen; j++)
1480            stall_on_dependency(
1481               st, reg_dependency_id(
1482                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1483      }
1484
1485      if (inst->reads_flag())
1486         stall_on_dependency(st, dependency_id_flag0);
1487
1488      /* Stall on any write dependencies. */
1489      if (!inst->no_dd_check) {
1490         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1491            for (unsigned j = 0; j < regs_written(inst); j++)
1492               stall_on_dependency(
1493                  st, reg_dependency_id(devinfo, inst->dst, j));
1494         }
1495
1496         if (inst->writes_accumulator_implicitly(devinfo)) {
1497            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1498                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1499                                           inst->exec_size - 1); j++)
1500               stall_on_dependency(
1501                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1502         }
1503
1504         if (inst->writes_flag(devinfo))
1505            stall_on_dependency(st, dependency_id_flag0);
1506      }
1507
1508      /* Execute the instruction. */
1509      execute_instruction(st, perf);
1510
1511      /* Mark any source dependencies. */
1512      if (inst->is_send_from_grf()) {
1513         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1514            for (unsigned j = 0; j < regs_read(inst, i); j++)
1515               mark_read_dependency(
1516                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1517         }
1518      }
1519
1520      if (inst->base_mrf != -1) {
1521         for (unsigned j = 0; j < inst->mlen; j++)
1522            mark_read_dependency(st, perf,
1523               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1524      }
1525
1526      /* Mark any destination dependencies. */
1527      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1528         for (unsigned j = 0; j < regs_written(inst); j++) {
1529            mark_write_dependency(st, perf,
1530                                  reg_dependency_id(devinfo, inst->dst, j));
1531         }
1532      }
1533
1534      if (inst->writes_accumulator_implicitly(devinfo)) {
1535         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1536              j <= accum_reg_of_channel(devinfo, inst, info.tx,
1537                                        inst->exec_size - 1); j++)
1538            mark_write_dependency(st, perf,
1539                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
1540      }
1541
1542      if (inst->writes_flag(devinfo))
1543         mark_write_dependency(st, perf, dependency_id_flag0);
1544   }
1545
1546   /**
1547    * Calculate the maximum possible throughput of the program compatible with
1548    * the cycle-count utilization estimated for each asynchronous unit, in
1549    * threads-per-cycle units.
1550    */
1551   float
1552   calculate_thread_throughput(const state &st, float busy)
1553   {
1554      for (unsigned i = 0; i < num_units; i++)
1555         busy = MAX2(busy, st.unit_busy[i]);
1556
1557      return 1.0 / busy;
1558   }
1559
1560   /**
1561    * Estimate the performance of the specified shader.
1562    */
1563   void
1564   calculate_performance(performance &p, const backend_shader *s,
1565                         void (*issue_instruction)(
1566                            state &, const intel_device_info *,
1567                            const backend_instruction *),
1568                         unsigned dispatch_width)
1569   {
1570      /* XXX - Note that the previous version of this code used worst-case
1571       *       scenario estimation of branching divergence for SIMD32 shaders,
1572       *       but this heuristic was removed to improve performance in common
1573       *       scenarios. Wider shader variants are less optimal when divergence
1574       *       is high, e.g. when application renders complex scene on a small
1575       *       surface. It is assumed that such renders are short, so their
1576       *       time doesn't matter and when it comes to the overall performance,
1577       *       they are dominated by more optimal larger renders.
1578       *
1579       *       It's possible that we could do better with divergence analysis
1580       *       by isolating branches which are 100% uniform.
1581       *
1582       *       Plumbing the trip counts from NIR loop analysis would allow us
1583       *       to do a better job regarding the loop weights.
1584       *
1585       *       In the meantime use values that roughly match the control flow
1586       *       weights used elsewhere in the compiler back-end.
1587       *
1588       *       Note that we provide slightly more pessimistic weights on
1589       *       Gfx12+ for SIMD32, since the effective warp size on that
1590       *       platform is 2x the SIMD width due to EU fusion, which increases
1591       *       the likelihood of divergent control flow in comparison to
1592       *       previous generations, giving narrower SIMD modes a performance
1593       *       advantage in several test-cases with non-uniform discard jumps.
1594       */
1595      const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1596                                    1.0 : 0.5);
1597      const float loop_weight = 10;
1598      unsigned halt_count = 0;
1599      unsigned elapsed = 0;
1600      state st;
1601
1602      foreach_block(block, s->cfg) {
1603         const unsigned elapsed0 = elapsed;
1604
1605         foreach_inst_in_block(backend_instruction, inst, block) {
1606            const unsigned clock0 = st.unit_ready[unit_fe];
1607
1608            issue_instruction(st, s->devinfo, inst);
1609
1610            if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1611               st.weight /= discard_weight;
1612
1613            elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
1614
1615            if (inst->opcode == BRW_OPCODE_DO)
1616               st.weight *= loop_weight;
1617            else if (inst->opcode == BRW_OPCODE_WHILE)
1618               st.weight /= loop_weight;
1619            else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1620               st.weight *= discard_weight;
1621         }
1622
1623         p.block_latency[block->num] = elapsed - elapsed0;
1624      }
1625
1626      p.latency = elapsed;
1627      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1628   }
1629}
1630
1631brw::performance::performance(const fs_visitor *v) :
1632   block_latency(new unsigned[v->cfg->num_blocks])
1633{
1634   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1635}
1636
1637brw::performance::performance(const vec4_visitor *v) :
1638   block_latency(new unsigned[v->cfg->num_blocks])
1639{
1640   calculate_performance(*this, v, issue_vec4_instruction, 8);
1641}
1642
1643brw::performance::~performance()
1644{
1645   delete[] block_latency;
1646}
1647