/*
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

/** @file hsw_queryobj.c
 *
 * Support for query buffer objects (GL_ARB_query_buffer_object) on Haswell+.
 */
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_batch.h"
#include "brw_buffer_objects.h"

/*
 * GPR0 = 80 * GPR0;
 */
static void
mult_gpr0_by_80(struct brw_context *brw)
{
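   /* MI_MATH has no multiply op, so build 80 * GPR0 as 16 * GPR0 + 64 * GPR0:
    * four ADDs of a register with itself double GPR0 into 16 * GPR0 (kept in
    * R1), two more doublings produce 64 * GPR0 (in R2), and a final ADD sums
    * the two partial products back into R0.
    */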
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      /* GPR1 = 16 * GPR0 */
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R2, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R2),
      MI_MATH_ALU2(LOAD, SRCB, R2),
      MI_MATH_ALU0(ADD),
      /* GPR2 = 64 * GPR0 */
      MI_MATH_ALU2(STORE, R2, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R2),
      MI_MATH_ALU0(ADD),
      /* GPR0 = 80 * GPR0 */
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct brw_context *brw, uint32_t n)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(AND),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   assert(n < 64);
   brw_load_register_imm64(brw, HSW_CS_GPR(1), (1ull << n) - 1);

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct brw_context *brw)
{
   /* First mask GPR0 to its lower 34 bits, so that the 30 doublings below
    * cannot carry past bit 63 of the 64-bit register.
    */
   keep_gpr0_lower_n_bits(brw, 34);

   static const uint32_t shl_maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_maths);
   const uint32_t batch_len = cmd_len * outer_count;

   BEGIN_BATCH(batch_len);

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
      for (int i = 0; i < inner_count; i++)
         for (int m = 0; m < ARRAY_SIZE(shl_maths); m++)
            OUT_BATCH(shl_maths[m]);
   }

   ADVANCE_BATCH();
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct brw_context *brw)
{
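   /* The ALU has no right-shift op, so shift left by 30 instead: the original
    * bits [33:2] end up in the upper dword of GPR0, which we then copy into
    * the lower dword before clearing the upper dword.
    */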
   shl_gpr0_by_30_bits(brw);
   brw_load_register_reg(brw, HSW_CS_GPR(0) + 4, HSW_CS_GPR(0));
   brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
}

/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct brw_context *brw)
{
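   /* ADDing zero to GPR0 sets the ALU zero flag (ZF) iff GPR0 == 0.  STOREINV
    * writes the inverted flag, replicated across the register, so R0 becomes
    * all ones when GPR0 was nonzero; ANDing with the constant 1 preloaded
    * into GPR1 then reduces that to a clean 0/1 boolean.
    */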
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU1(LOAD0, SRCB),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STOREINV, R0, ZF),
      MI_MATH_ALU2(LOAD, SRCA, R0),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(AND),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   brw_load_register_imm64(brw, HSW_CS_GPR(1), 1ull);

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

static void
load_overflow_data_to_cs_gprs(struct brw_context *brw,
                              struct brw_query_object *query,
                              int idx)
{
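   /* Each stream's overflow data occupies four consecutive uint64s in the
    * query BO; assuming the begin/end hooks store the SO_NUM_PRIMS_WRITTEN
    * snapshots in slots 0/1 and the SO_PRIM_STORAGE_NEEDED snapshots in
    * slots 2/3, GPR1-GPR4 end up holding written-begin, written-end,
    * generated-begin and generated-end respectively.
    */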
   int offset = idx * sizeof(uint64_t) * 4;

   brw_load_register_mem64(brw, HSW_CS_GPR(1), query->bo, offset);

   offset += sizeof(uint64_t);
   brw_load_register_mem64(brw, HSW_CS_GPR(2), query->bo, offset);

   offset += sizeof(uint64_t);
   brw_load_register_mem64(brw, HSW_CS_GPR(3), query->bo, offset);

   offset += sizeof(uint64_t);
   brw_load_register_mem64(brw, HSW_CS_GPR(4), query->bo, offset);
}

/*
 * With GPR1-GPR4 loaded by load_overflow_data_to_cs_gprs():
 *
 * R3 = R4 - R3;   primitives generated during the query
 * R1 = R2 - R1;   primitives written during the query
 * R1 = R3 - R1;   nonzero iff the stream overflowed
 * R0 = R0 | R1;   accumulate the overflow flag across streams
 */
static void
calc_overflow_for_stream(struct brw_context *brw)
{
   static const uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, SRCA, R4),
      MI_MATH_ALU2(LOAD, SRCB, R3),
      MI_MATH_ALU0(SUB),
      MI_MATH_ALU2(STORE, R3, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R2),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(SUB),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R3),
      MI_MATH_ALU2(LOAD, SRCB, R1),
      MI_MATH_ALU0(SUB),
      MI_MATH_ALU2(STORE, R1, ACCU),
      MI_MATH_ALU2(LOAD, SRCA, R1),
      MI_MATH_ALU2(LOAD, SRCB, R0),
      MI_MATH_ALU0(OR),
      MI_MATH_ALU2(STORE, R0, ACCU),
   };

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

242
243static void
244calc_overflow_to_gpr0(struct brw_context *brw, struct brw_query_object *query,
245                       int count)
246{
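   /* Start with GPR0 = 0 and OR in each stream's overflow flag, so that GPR0
    * is nonzero afterwards iff any of the count streams overflowed.
    */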
   brw_load_register_imm64(brw, HSW_CS_GPR(0), 0ull);

   for (int i = 0; i < count; i++) {
      load_overflow_data_to_cs_gprs(brw, query, i);
      calc_overflow_for_stream(brw);
   }
}

/*
 * Take a query and calculate whether there was overflow during transform
 * feedback. Store the result in the GPR0 register.
 */
void
hsw_overflow_result_to_gpr0(struct brw_context *brw,
                            struct brw_query_object *query,
                            int count)
{
   calc_overflow_to_gpr0(brw, query, count);
   gpr0_to_bool(brw);
}

static void
hsw_result_to_gpr0(struct gl_context *ctx, struct brw_query_object *query,
                   struct gl_buffer_object *buf, intptr_t offset,
                   GLenum pname, GLenum ptype)
{
   struct brw_context *brw = brw_context(ctx);
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(query->bo);
   assert(pname != GL_QUERY_TARGET);

   if (pname == GL_QUERY_RESULT_AVAILABLE) {
      /* The query result availability is stored at offset
       * 2 * sizeof(uint64_t) of the buffer (see set_predicate()).
       */
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(0),
                              query->bo,
                              2 * sizeof(uint64_t));
      return;
   }

   if (pname == GL_QUERY_RESULT) {
      /* Since GL_QUERY_RESULT_NO_WAIT wasn't used, they want us to stall to
       * make sure the query is available.
       */
      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_CS_STALL |
                                  PIPE_CONTROL_STALL_AT_SCOREBOARD);
   }

   if (query->Base.Target == GL_TIMESTAMP) {
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(0),
                              query->bo,
                              0 * sizeof(uint64_t));
   } else if (query->Base.Target == GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB ||
              query->Base.Target == GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB) {
      /* Don't do anything in advance here, since the math for this is a little
       * more complex.
       */
   } else {
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(1),
                              query->bo,
                              0 * sizeof(uint64_t));
      brw_load_register_mem64(brw,
                              HSW_CS_GPR(2),
                              query->bo,
                              1 * sizeof(uint64_t));

      BEGIN_BATCH(5);
      OUT_BATCH(HSW_MI_MATH | (5 - 2));

      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
      OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
      OUT_BATCH(MI_MATH_ALU0(SUB));
      OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));

      ADVANCE_BATCH();
   }

   switch (query->Base.Target) {
   case GL_FRAGMENT_SHADER_INVOCATIONS_ARB:
      /* Implement the "WaDividePSInvocationCountBy4:HSW,BDW" workaround:
       * "Invocation counter is 4 times actual.  WA: SW to divide HW reported
       *  PS Invocations value by 4."
       *
       * Prior to Haswell, invocation count was counted by the WM, and it
       * buggily counted invocations in units of subspans (2x2 unit). To get
       * the correct value, the CS multiplied this by 4. With HSW the logic
       * moved, and correctly emitted the number of pixel shader invocations,
       * but someone forgot to undo the multiply by 4.
       */
      if (devinfo->ver == 8 || devinfo->is_haswell)
         shr_gpr0_by_2_bits(brw);
      break;
   case GL_TIME_ELAPSED:
   case GL_TIMESTAMP:
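      /* The raw timestamp ticks once every 80ns on these GPUs; multiply by
       * 80 to convert ticks to nanoseconds.
       */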
      mult_gpr0_by_80(brw);
      if (query->Base.Target == GL_TIMESTAMP) {
         /* Truncate to 36 bits of nanoseconds so the GPU-computed timestamp
          * wraps the same way as the CPU path (GL_QUERY_COUNTER_BITS is 36).
          */
         keep_gpr0_lower_n_bits(brw, 36);
      }
      break;
   case GL_ANY_SAMPLES_PASSED:
   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
      gpr0_to_bool(brw);
      break;
   case GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB:
      hsw_overflow_result_to_gpr0(brw, query, 1);
      break;
   case GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB:
      hsw_overflow_result_to_gpr0(brw, query, MAX_VERTEX_STREAMS);
      break;
   }
}

/*
 * Store immediate data into the user buffer using the requested size.
 */
static void
store_query_result_imm(struct brw_context *brw, struct brw_bo *bo,
                       uint32_t offset, GLenum ptype, uint64_t imm)
{
   switch (ptype) {
   case GL_INT:
   case GL_UNSIGNED_INT:
      brw_store_data_imm32(brw, bo, offset, imm);
      break;
   case GL_INT64_ARB:
   case GL_UNSIGNED_INT64_ARB:
      brw_store_data_imm64(brw, bo, offset, imm);
      break;
   default:
      unreachable("Unexpected result type");
   }
}

static void
set_predicate(struct brw_context *brw, struct brw_bo *query_bo)
{
   brw_load_register_imm64(brw, MI_PREDICATE_SRC1, 0ull);

   /* Load query availability into SRC0 */
   brw_load_register_mem64(brw, MI_PREDICATE_SRC0, query_bo,
                           2 * sizeof(uint64_t));

   /* predicate = !(query_availability == 0); */
   BEGIN_BATCH(1);
   OUT_BATCH(GFX7_MI_PREDICATE |
             MI_PREDICATE_LOADOP_LOADINV |
             MI_PREDICATE_COMBINEOP_SET |
             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
   ADVANCE_BATCH();
}

/*
 * Store data from the register into the user buffer using the requested size.
 * When pipelined is set, the write is predicated so that no result is written
 * if the query has not finished yet.
 */
static void
store_query_result_reg(struct brw_context *brw, struct brw_bo *bo,
                       uint32_t offset, GLenum ptype, uint32_t reg,
                       const bool pipelined)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;
   uint32_t cmd_size = devinfo->ver >= 8 ? 4 : 3;
   uint32_t dwords = (ptype == GL_INT || ptype == GL_UNSIGNED_INT) ? 1 : 2;
   assert(devinfo->ver >= 6);

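   /* A 64-bit result is written as two 32-bit MI_STORE_REGISTER_MEM commands,
    * one per dword of the GPR; 32-bit results take a single write.
    */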
   BEGIN_BATCH(dwords * cmd_size);
   for (int i = 0; i < dwords; i++) {
      OUT_BATCH(MI_STORE_REGISTER_MEM |
                (pipelined ? MI_STORE_REGISTER_MEM_PREDICATE : 0) |
                (cmd_size - 2));
      OUT_BATCH(reg + 4 * i);
      if (devinfo->ver >= 8) {
         OUT_RELOC64(bo, RELOC_WRITE, offset + 4 * i);
      } else {
         OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + 4 * i);
      }
   }
   ADVANCE_BATCH();
}

static void
hsw_store_query_result(struct gl_context *ctx, struct gl_query_object *q,
                       struct gl_buffer_object *buf, intptr_t offset,
                       GLenum pname, GLenum ptype)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_query_object *query = (struct brw_query_object *)q;
   struct brw_buffer_object *bo = brw_buffer_object(buf);
   const bool pipelined = brw_is_query_pipelined(query);

   if (pname == GL_QUERY_TARGET) {
      store_query_result_imm(brw, bo->buffer, offset, ptype,
                             query->Base.Target);
      return;
   } else if (pname == GL_QUERY_RESULT_AVAILABLE && !pipelined) {
      store_query_result_imm(brw, bo->buffer, offset, ptype, 1ull);
   } else if (query->bo) {
      /* The query bo is still around. Therefore, we:
       *
       *  1. Compute the current result in GPR0
       *  2. Set the command streamer predicate based on query availability
       *  3. (With predication) Write GPR0 to the requested buffer
       */
      hsw_result_to_gpr0(ctx, query, buf, offset, pname, ptype);
      if (pipelined)
         set_predicate(brw, query->bo);
      store_query_result_reg(brw, bo->buffer, offset, ptype, HSW_CS_GPR(0),
                             pipelined);
   } else {
      /* The query bo is gone, so the query must have been processed into
       * client memory. In this case we can fill the buffer location with the
       * requested data using MI_STORE_DATA_IMM.
       */
      switch (pname) {
      case GL_QUERY_RESULT_AVAILABLE:
         store_query_result_imm(brw, bo->buffer, offset, ptype, 1ull);
         break;
      case GL_QUERY_RESULT_NO_WAIT:
      case GL_QUERY_RESULT:
         store_query_result_imm(brw, bo->buffer, offset, ptype,
                                q->Result);
         break;
      default:
         unreachable("Unexpected result type");
      }
   }
}

/* Initialize hsw+-specific query object functions. */
void hsw_init_queryobj_functions(struct dd_function_table *functions)
{
   gfx6_init_queryobj_functions(functions);
   functions->StoreQueryResult = hsw_store_query_result;
}