/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * @file iris_query.c
 *
 * Query object support.  This allows measuring various simple statistics
 * via counters on the GPU.
 */

#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_inlines.h"
#include "util/u_upload_mgr.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_fence.h"
#include "iris_resource.h"
#include "iris_screen.h"
#include "vulkan/util/vk_util.h"

#define IA_VERTICES_COUNT          0x2310
#define IA_PRIMITIVES_COUNT        0x2318
#define VS_INVOCATION_COUNT        0x2320
#define HS_INVOCATION_COUNT        0x2300
#define DS_INVOCATION_COUNT        0x2308
#define GS_INVOCATION_COUNT        0x2328
#define GS_PRIMITIVES_COUNT        0x2330
#define CL_INVOCATION_COUNT        0x2338
#define CL_PRIMITIVES_COUNT        0x2340
#define PS_INVOCATION_COUNT        0x2348
#define CS_INVOCATION_COUNT        0x2290
#define PS_DEPTH_COUNT             0x2350

#define SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)

#define SO_NUM_PRIMS_WRITTEN(n)    (0x5200 + (n) * 8)

#define MI_MATH (0x1a << 23)

#define MI_ALU_LOAD      0x080
#define MI_ALU_LOADINV   0x480
#define MI_ALU_LOAD0     0x081
#define MI_ALU_LOAD1     0x481
#define MI_ALU_ADD       0x100
#define MI_ALU_SUB       0x101
#define MI_ALU_AND       0x102
#define MI_ALU_OR        0x103
#define MI_ALU_XOR       0x104
#define MI_ALU_STORE     0x180
#define MI_ALU_STOREINV  0x580
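
/* Each MI_MATH instruction dword packs one ALU opcode and two operand
 * encodings: bits 31:20 hold the opcode, bits 19:10 the first operand, and
 * bits 9:0 the second operand (see the _MI_ALU() helper below).  For
 * example, MI_ALU2(LOAD, SRCA, R0) assembles to
 * (0x080 << 20) | (0x20 << 10) | 0x00 = 0x08008000.
 */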
#define MI_ALU_R0     0x00
#define MI_ALU_R1     0x01
#define MI_ALU_R2     0x02
#define MI_ALU_R3     0x03
#define MI_ALU_R4     0x04
#define MI_ALU_SRCA   0x20
#define MI_ALU_SRCB   0x21
#define MI_ALU_ACCU   0x31
#define MI_ALU_ZF     0x32
#define MI_ALU_CF     0x33

#define _MI_ALU(op, x, y)  (((op) << 20) | ((x) << 10) | (y))

#define _MI_ALU0(op)       _MI_ALU(MI_ALU_##op, 0, 0)
#define _MI_ALU1(op, x)    _MI_ALU(MI_ALU_##op, x, 0)
#define _MI_ALU2(op, x, y) _MI_ALU(MI_ALU_##op, x, y)

#define MI_ALU0(op)        _MI_ALU0(op)
#define MI_ALU1(op, x)     _MI_ALU1(op, MI_ALU_##x)
#define MI_ALU2(op, x, y)  _MI_ALU2(op, MI_ALU_##x, MI_ALU_##y)

#define emit_lri32 ice->vtbl.load_register_imm32
#define emit_lri64 ice->vtbl.load_register_imm64
#define emit_lrr32 ice->vtbl.load_register_reg32

struct iris_query {
   enum pipe_query_type type;
   int index;

   bool ready;

   bool stalled;

   uint64_t result;

   struct iris_state_ref query_state_ref;
   struct iris_query_snapshots *map;
   struct iris_syncpt *syncpt;

   int batch_idx;
};

struct iris_query_snapshots {
   /** iris_render_condition's saved MI_PREDICATE_RESULT value. */
   uint64_t predicate_result;

   /** Have the start/end snapshots landed? */
   uint64_t snapshots_landed;

   /** Starting and ending counter snapshots */
   uint64_t start;
   uint64_t end;
};

struct iris_query_so_overflow {
   uint64_t predicate_result;
   uint64_t snapshots_landed;

   struct {
      uint64_t prim_storage_needed[2];
      uint64_t num_prims[2];
   } stream[4];
};
/**
 * Is this type of query written by PIPE_CONTROL?
 */
static bool
iris_is_query_pipelined(struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_TIME_ELAPSED:
      return true;

   default:
      return false;
   }
}

static void
mark_available(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
   unsigned offset = offsetof(struct iris_query_snapshots, snapshots_landed);
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   offset += q->query_state_ref.offset;

   if (!iris_is_query_pipelined(q)) {
      ice->vtbl.store_data_imm64(batch, bo, offset, true);
   } else {
      /* Order available *after* the query results. */
      flags |= PIPE_CONTROL_FLUSH_ENABLE;
      iris_emit_pipe_control_write(batch, flags, bo, offset, true);
   }
}

/**
 * Write a pipelined snapshot (such as PS_DEPTH_COUNT or a timestamp) to
 * the query's buffer at the given offset via a PIPE_CONTROL.
 */
static void
iris_pipelined_write(struct iris_batch *batch,
                     struct iris_query *q,
                     enum pipe_control_flags flags,
                     unsigned offset)
{
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   const unsigned optional_cs_stall =
      devinfo->gen == 9 && devinfo->gt == 4 ? PIPE_CONTROL_CS_STALL : 0;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   iris_emit_pipe_control_write(batch, flags | optional_cs_stall,
                                bo, offset, 0ull);
}

static void
write_value(struct iris_context *ice, struct iris_query *q, unsigned offset)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (!iris_is_query_pipelined(q)) {
      iris_emit_pipe_control_flush(batch,
                                   PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
      q->stalled = true;
   }

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      if (devinfo->gen >= 10) {
         /* "Driver must program PIPE_CONTROL with only Depth Stall Enable
          *  bit set prior to programming a PIPE_CONTROL with Write PS Depth
          *  Count sync operation."
          */
         iris_emit_pipe_control_flush(batch, PIPE_CONTROL_DEPTH_STALL);
      }
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_DEPTH_COUNT |
                           PIPE_CONTROL_DEPTH_STALL,
                           offset);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      iris_pipelined_write(&ice->batches[IRIS_BATCH_RENDER], q,
                           PIPE_CONTROL_WRITE_TIMESTAMP,
                           offset);
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      ice->vtbl.store_register_mem64(batch,
                                     q->index == 0 ? CL_INVOCATION_COUNT :
                                     SO_PRIM_STORAGE_NEEDED(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      ice->vtbl.store_register_mem64(batch,
                                     SO_NUM_PRIMS_WRITTEN(q->index),
                                     bo, offset, false);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
      static const uint32_t index_to_reg[] = {
         IA_VERTICES_COUNT,
         IA_PRIMITIVES_COUNT,
         VS_INVOCATION_COUNT,
         GS_INVOCATION_COUNT,
         GS_PRIMITIVES_COUNT,
         CL_INVOCATION_COUNT,
         CL_PRIMITIVES_COUNT,
         PS_INVOCATION_COUNT,
         HS_INVOCATION_COUNT,
         DS_INVOCATION_COUNT,
         CS_INVOCATION_COUNT,
      };
      const uint32_t reg = index_to_reg[q->index];

      ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
      break;
   }
   default:
      assert(false);
   }
}

static void
write_overflow_values(struct iris_context *ice, struct iris_query *q, bool end)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   uint32_t count = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   iris_emit_pipe_control_flush(batch,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
   for (uint32_t i = 0; i < count; i++) {
      int s = q->index + i;
      int g_idx = offset + offsetof(struct iris_query_so_overflow,
                                    stream[s].num_prims[end]);
      int w_idx = offset + offsetof(struct iris_query_so_overflow,
                                    stream[s].prim_storage_needed[end]);
      ice->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
                                     bo, g_idx, false);
      ice->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
                                     bo, w_idx, false);
   }
}
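
/* The GPU timestamp counter ticks at devinfo->timestamp_frequency Hz, so a
 * raw tick count converts to nanoseconds as ticks * 10^9 / frequency.  For
 * example, at a 12 MHz timestamp frequency (a common Gen9 value), one tick
 * is 1000000000 / 12000000 = 83.333... ns, so a delta of 120 ticks scales
 * to exactly 10000 ns.
 */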
uint64_t
iris_timebase_scale(const struct gen_device_info *devinfo,
                    uint64_t gpu_timestamp)
{
   return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency;
}

static uint64_t
iris_raw_timestamp_delta(uint64_t time0, uint64_t time1)
{
   if (time0 > time1) {
      return (1ULL << TIMESTAMP_BITS) + time1 - time0;
   } else {
      return time1 - time0;
   }
}

static bool
stream_overflowed(struct iris_query_so_overflow *so, int s)
{
   return (so->stream[s].prim_storage_needed[1] -
           so->stream[s].prim_storage_needed[0]) !=
          (so->stream[s].num_prims[1] - so->stream[s].num_prims[0]);
}

static void
calculate_result_on_cpu(const struct gen_device_info *devinfo,
                        struct iris_query *q)
{
   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      q->result = q->map->end != q->map->start;
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* The timestamp is the single starting snapshot. */
      q->result = iris_timebase_scale(devinfo, q->map->start);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      q->result = iris_raw_timestamp_delta(q->map->start, q->map->end);
      q->result = iris_timebase_scale(devinfo, q->result);
      q->result &= (1ull << TIMESTAMP_BITS) - 1;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->result = stream_overflowed((void *) q->map, q->index);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      q->result = false;
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
         q->result |= stream_overflowed((void *) q->map, i);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      q->result = q->map->end - q->map->start;

      /* WaDividePSInvocationCountBy4:HSW,BDW */
      if (devinfo->gen == 8 && q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
         q->result /= 4;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   default:
      q->result = q->map->end - q->map->start;
      break;
   }

   q->ready = true;
}

static void
emit_alu_add(struct iris_batch *batch, unsigned dst_reg,
             unsigned reg_a, unsigned reg_b)
{
   uint32_t *math = iris_get_command_space(batch, 5 * sizeof(uint32_t));

   math[0] = MI_MATH | (5 - 2);
   math[1] = _MI_ALU2(LOAD, MI_ALU_SRCA, reg_a);
   math[2] = _MI_ALU2(LOAD, MI_ALU_SRCB, reg_b);
   math[3] = _MI_ALU0(ADD);
   math[4] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
}

static void
emit_alu_shl(struct iris_batch *batch, unsigned dst_reg,
             unsigned src_reg, unsigned shift)
{
   assert(shift > 0);

   int dwords = 1 + 4 * shift;

   uint32_t *math = iris_get_command_space(batch, sizeof(uint32_t) * dwords);

   math[0] = MI_MATH | ((1 + 4 * shift) - 2);

   for (unsigned i = 0; i < shift; i++) {
      unsigned add_src = (i == 0) ? src_reg : dst_reg;
      math[1 + (i * 4) + 0] = _MI_ALU2(LOAD, MI_ALU_SRCA, add_src);
      math[1 + (i * 4) + 1] = _MI_ALU2(LOAD, MI_ALU_SRCB, add_src);
      math[1 + (i * 4) + 2] = _MI_ALU0(ADD);
      math[1 + (i * 4) + 3] = _MI_ALU2(STORE, dst_reg, MI_ALU_ACCU);
   }
}
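
/* The MI_MATH ALU has no multiply, so build_alu_multiply_gpr0() below uses
 * the classic shift-and-add decomposition: walk the bits of N from the top
 * bit down, doubling the running value at each step (via ADD, since there
 * is no shift either) and adding in the original GPR0 whenever the current
 * bit of N is set.  For example, N = 6 (0b110) becomes
 * GPR0 * 6 = ((GPR0 * 2 + GPR0) * 2), using GPR1 as scratch.
 */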
/* Emit dwords to multiply GPR0 by N */
static void
build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
{
   VK_OUTARRAY_MAKE(out, dw, dw_count);

#define APPEND_ALU(op, x, y) \
   vk_outarray_append(&out, alu_dw) *alu_dw = _MI_ALU(MI_ALU_##op, x, y)

   assert(N > 0);
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      /* We get our initial data in GPR0 and we write the final data out to
       * GPR0 but we use GPR1 as our scratch register.
       */
      unsigned src_reg = i == top_bit - 1 ? MI_ALU_R0 : MI_ALU_R1;
      unsigned dst_reg = i == 0 ? MI_ALU_R0 : MI_ALU_R1;

      /* Shift the current value left by 1 */
      APPEND_ALU(LOAD, MI_ALU_SRCA, src_reg);
      APPEND_ALU(LOAD, MI_ALU_SRCB, src_reg);
      APPEND_ALU(ADD, 0, 0);

      if (N & (1 << i)) {
         /* Store ACCU to R1 and add R0 to R1 */
         APPEND_ALU(STORE, MI_ALU_R1, MI_ALU_ACCU);
         APPEND_ALU(LOAD, MI_ALU_SRCA, MI_ALU_R0);
         APPEND_ALU(LOAD, MI_ALU_SRCB, MI_ALU_R1);
         APPEND_ALU(ADD, 0, 0);
      }

      APPEND_ALU(STORE, dst_reg, MI_ALU_ACCU);
   }

#undef APPEND_ALU
}

static void
emit_mul_gpr0(struct iris_batch *batch, uint32_t N)
{
   uint32_t num_dwords;
   build_alu_multiply_gpr0(NULL, &num_dwords, N);

   /* num_dwords counts only the ALU dwords; account for the one MI_MATH
    * header dword in both the allocation and the DWord Length field, which
    * is total dwords minus two (matching emit_alu_add/emit_alu_shl above).
    */
   uint32_t *math = iris_get_command_space(batch, 4 * (num_dwords + 1));
   math[0] = MI_MATH | ((num_dwords + 1) - 2);
   build_alu_multiply_gpr0(&math[1], &num_dwords, N);
}
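
/* iris_math_div32_gpr0() below divides GPR0 by an arbitrary compile-time
 * constant D without a hardware divider.  util_compute_fast_udiv_info()
 * turns the division into a multiply by a 32-bit fixed-point reciprocal
 * plus shifts, roughly
 *
 *    x / D = ((x >> pre_shift) * multiplier
 *             + (increment ? multiplier : 0)) >> (32 + post_shift)
 *
 * where adding the multiplier to the product is the same as multiplying
 * (x + 1).  Right shifts themselves are emulated by shifting left into the
 * upper half of the 64-bit GPR and copying the high dword to the low one.
 */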
void
iris_math_div32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t D)
{
   /* Zero out the top of GPR0 */
   emit_lri32(batch, CS_GPR(0) + 4, 0);

   if (D == 0) {
      /* This is invalid, but we should do something, so we set GPR0 to 0. */
      emit_lri32(batch, CS_GPR(0), 0);
   } else if (util_is_power_of_two_or_zero(D)) {
      unsigned log2_D = util_logbase2(D);
      assert(log2_D < 32);
      /* We right-shift by log2(D) by left-shifting by 32 - log2(D) and
       * taking the top 32 bits of the result.
       */
      emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - log2_D);
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift) {
         /* We right-shift by m.pre_shift by left-shifting by
          * 32 - m.pre_shift and taking the top 32 bits of the result.
          */
         if (m.pre_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.pre_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }

      /* Do the 32x32 multiply into gpr0 */
      emit_mul_gpr0(batch, m.multiplier);

      if (m.increment) {
         /* If we need to increment, add the multiplier to the product;
          * this is equivalent to having multiplied (GPR0 + 1) by it.
          */
         emit_lri32(batch, CS_GPR(1) + 0, m.multiplier);
         emit_lri32(batch, CS_GPR(1) + 4, 0);
         emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
      }

      /* Shift by 32 */
      emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
      emit_lri32(batch, CS_GPR(0) + 4, 0);

      if (m.post_shift) {
         /* We right-shift by m.post_shift by left-shifting by
          * 32 - m.post_shift and taking the top 32 bits of the result.
          */
         if (m.post_shift < 32)
            emit_alu_shl(batch, MI_ALU_R0, MI_ALU_R0, 32 - m.post_shift);
         emit_lrr32(batch, CS_GPR(0) + 0, CS_GPR(0) + 4);
         emit_lri32(batch, CS_GPR(0) + 4, 0);
      }
   }
}

void
iris_math_add32_gpr0(struct iris_context *ice,
                     struct iris_batch *batch,
                     uint32_t x)
{
   emit_lri32(batch, CS_GPR(1), x);
   emit_alu_add(batch, MI_ALU_R0, MI_ALU_R0, MI_ALU_R1);
}
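
/* gpr0_to_bool() below normalizes GPR0 to exactly 0 or 1.  Adding zero to
 * GPR0 sets the ALU zero flag (ZF) iff GPR0 == 0, so STOREINV(R0, ZF)
 * leaves a nonzero value in R0 exactly when GPR0 was nonzero, and ANDing
 * with the constant 1 loaded into R1 clips that to a 0/1 boolean.  (This
 * relies on a set flag reading back as all-ones from the MI_MATH ALU, an
 * assumption about the hardware that this file does not spell out.)
 */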
/*
 * GPR0 = (GPR0 == 0) ? 0 : 1;
 */
static void
gpr0_to_bool(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), 1ull);

   static const uint32_t math[] = {
      MI_MATH | (9 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU1(LOAD0, SRCB),
      MI_ALU0(ADD),
      MI_ALU2(STOREINV, R0, ZF),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

static void
load_overflow_data_to_cs_gprs(struct iris_context *ice,
                              struct iris_query *q,
                              int idx)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].prim_storage_needed[1]));

   ice->vtbl.load_register_mem64(batch, CS_GPR(3), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[0]));
   ice->vtbl.load_register_mem64(batch, CS_GPR(4), bo, offset +
                                 offsetof(struct iris_query_so_overflow,
                                          stream[idx].num_prims[1]));
}
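
/* With the loads above, R1/R2 hold the begin/end prim_storage_needed
 * snapshots and R3/R4 the begin/end num_prims snapshots, so the MI_MATH
 * below computes (num_prims delta) - (storage_needed delta) and ORs it
 * into R0: the stream overflowed iff the two deltas differ, matching
 * stream_overflowed() on the CPU path.
 */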
/*
 * R3 = R4 - R3;
 * R1 = R2 - R1;
 * R1 = R3 - R1;
 * R0 = R0 | R1;
 */
static void
calc_overflow_for_stream(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   static const uint32_t maths[] = {
      MI_MATH | (17 - 2),
      MI_ALU2(LOAD, SRCA, R4),
      MI_ALU2(LOAD, SRCB, R3),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R3, ACCU),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R3),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R1, ACCU),
      MI_ALU2(LOAD, SRCA, R1),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(OR),
      MI_ALU2(STORE, R0, ACCU),
   };

   iris_batch_emit(batch, maths, sizeof(maths));
}

static void
overflow_result_to_gpr0(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(0), 0ull);

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
      load_overflow_data_to_cs_gprs(ice, q, q->index);
      calc_overflow_for_stream(ice);
   } else {
      for (int i = 0; i < MAX_VERTEX_STREAMS; i++) {
         load_overflow_data_to_cs_gprs(ice, q, i);
         calc_overflow_for_stream(ice);
      }
   }

   gpr0_to_bool(ice);
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct iris_context *ice, uint32_t n)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];

   ice->vtbl.load_register_imm64(batch, CS_GPR(1), (1ull << n) - 1);
   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(AND),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(ice, 34);

   static const uint32_t shl_math[] = {
      MI_ALU2(LOAD, SRCA, R0),
      MI_ALU2(LOAD, SRCB, R0),
      MI_ALU0(ADD),
      MI_ALU2(STORE, R0, ACCU),
   };

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   const uint32_t cmd_len = 1 + inner_count * ARRAY_SIZE(shl_math);
   const uint32_t batch_len = cmd_len * outer_count;
   uint32_t *map = iris_get_command_space(batch, batch_len * 4);
   uint32_t offset = 0;
   /* Each MI_MATH packet doubles GPR0 six times; five packets give a
    * total shift of 30.
    */
   for (int o = 0; o < outer_count; o++) {
      map[offset++] = MI_MATH | (cmd_len - 2);
      for (int i = 0; i < inner_count; i++) {
         memcpy(&map[offset], shl_math, sizeof(shl_math));
         offset += 4;
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct iris_context *ice)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   shl_gpr0_by_30_bits(ice);
   /* Copy the upper dword of the shifted value into the lower dword, then
    * clear the upper dword (load_register_reg32 takes the destination
    * first, like the other load_register_* hooks used in this file).
    */
   ice->vtbl.load_register_reg32(batch, CS_GPR(0), CS_GPR(0) + 4);
   ice->vtbl.load_register_imm32(batch, CS_GPR(0) + 4, 0);
}
/**
 * Calculate the result and store it to CS_GPR0.
 */
static void
calculate_result_on_gpu(struct iris_context *ice, struct iris_query *q)
{
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   uint32_t offset = q->query_state_ref.offset;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
      overflow_result_to_gpr0(ice, q);
      return;
   }

   if (q->type == PIPE_QUERY_TIMESTAMP) {
      ice->vtbl.load_register_mem64(batch, CS_GPR(0), bo, offset +
                                    offsetof(struct iris_query_snapshots, start));
      /* TODO: This discards any fractional bits of the timebase scale.
       * We would need to do a bit of fixed point math on the CS ALU, or
       * launch an actual shader to calculate this with full precision.
       */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
      keep_gpr0_lower_n_bits(ice, 36);
      return;
   }

   ice->vtbl.load_register_mem64(batch, CS_GPR(1), bo, offset +
                                 offsetof(struct iris_query_snapshots, start));
   ice->vtbl.load_register_mem64(batch, CS_GPR(2), bo, offset +
                                 offsetof(struct iris_query_snapshots, end));

   static const uint32_t math[] = {
      MI_MATH | (5 - 2),
      MI_ALU2(LOAD, SRCA, R2),
      MI_ALU2(LOAD, SRCB, R1),
      MI_ALU0(SUB),
      MI_ALU2(STORE, R0, ACCU),
   };
   iris_batch_emit(batch, math, sizeof(math));

   /* WaDividePSInvocationCountBy4:HSW,BDW */
   if (devinfo->gen == 8 &&
       q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
      shr_gpr0_by_2_bits(ice);

   if (q->type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       q->type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      gpr0_to_bool(ice);

   if (q->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: This discards fractional bits (see above). */
      emit_mul_gpr0(batch, (1000000000ull / screen->devinfo.timestamp_frequency));
   }
}

static struct pipe_query *
iris_create_query(struct pipe_context *ctx,
                  unsigned query_type,
                  unsigned index)
{
   struct iris_query *q = calloc(1, sizeof(struct iris_query));

   q->type = query_type;
   q->index = index;

   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
      q->batch_idx = IRIS_BATCH_COMPUTE;
   else
      q->batch_idx = IRIS_BATCH_RENDER;
   return (struct pipe_query *) q;
}

static void
iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
{
   struct iris_query *query = (void *) p_query;
   struct iris_screen *screen = (void *) ctx->screen;
   iris_syncpt_reference(screen, &query->syncpt, NULL);
   free(query);
}

static boolean
iris_begin_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   void *ptr = NULL;
   uint32_t size;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      size = sizeof(struct iris_query_so_overflow);
   else
      size = sizeof(struct iris_query_snapshots);

   u_upload_alloc(ice->query_buffer_uploader, 0,
                  size, size, &q->query_state_ref.offset,
                  &q->query_state_ref.res, &ptr);

   if (!iris_resource_bo(q->query_state_ref.res))
      return false;

   q->map = ptr;
   if (!q->map)
      return false;

   q->result = 0ull;
   q->ready = false;
   WRITE_ONCE(q->map->snapshots_landed, false);

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = true;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, false);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, start));

   return true;
}

static bool
iris_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];

   if (q->type == PIPE_QUERY_TIMESTAMP) {
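      /* A TIMESTAMP query measures a single point in time, so there is no
       * interval to bracket.  Reuse the begin path to allocate the snapshot
       * buffer and write the lone timestamp into the 'start' slot, which is
       * where calculate_result_on_cpu()/_on_gpu() read it from.
       */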
      iris_begin_query(ctx, query);

      iris_batch_reference_signal_syncpt(batch, &q->syncpt);
      mark_available(ice, q);
      return true;
   }

   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
      ice->state.prims_generated_query_active = false;
      ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
   }

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      write_overflow_values(ice, q, true);
   else
      write_value(ice, q,
                  q->query_state_ref.offset +
                  offsetof(struct iris_query_snapshots, end));

   iris_batch_reference_signal_syncpt(batch, &q->syncpt);
   mark_available(ice, q);

   return true;
}

/**
 * See if the snapshots have landed for a query, and if so, compute the
 * result and mark it ready.  Does not flush (unlike iris_get_query_result).
 */
static void
iris_check_query_no_flush(struct iris_context *ice, struct iris_query *q)
{
   struct iris_screen *screen = (void *) ice->ctx.screen;
   const struct gen_device_info *devinfo = &screen->devinfo;

   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
      calculate_result_on_cpu(devinfo, q);
   }
}

static boolean
iris_get_query_result(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean wait,
                      union pipe_query_result *result)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_screen *screen = (void *) ctx->screen;
   const struct gen_device_info *devinfo = &screen->devinfo;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   if (!q->ready) {
      if (iris_batch_references(&ice->batches[q->batch_idx], bo))
         iris_batch_flush(&ice->batches[q->batch_idx]);

      while (!READ_ONCE(q->map->snapshots_landed)) {
         if (wait)
            iris_wait_syncpt(ctx->screen, q->syncpt, INT64_MAX);
         else
            return false;
      }

      assert(READ_ONCE(q->map->snapshots_landed));
      calculate_result_on_cpu(devinfo, q);
   }

   assert(q->ready);

   result->u64 = q->result;

   return true;
}

static void
iris_get_query_result_resource(struct pipe_context *ctx,
                               struct pipe_query *query,
                               boolean wait,
                               enum pipe_query_value_type result_type,
                               int index,
                               struct pipe_resource *p_res,
                               unsigned offset)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;
   struct iris_batch *batch = &ice->batches[q->batch_idx];
   const struct gen_device_info *devinfo = &batch->screen->devinfo;
   struct iris_resource *res = (void *) p_res;
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);
   unsigned snapshots_landed_offset =
      offsetof(struct iris_query_snapshots, snapshots_landed);

   res->bind_history |= PIPE_BIND_QUERY_BUFFER;

   if (index == -1) {
      /* They're asking for the availability of the result.  If we still
       * have commands queued up which produce the result, submit them
       * now so that progress happens.  Either way, copy the snapshots
       * landed field to the destination resource.
       */
      if (iris_batch_references(batch, bo))
         iris_batch_flush(batch);

      ice->vtbl.copy_mem_mem(batch, iris_resource_bo(p_res), offset,
                             bo, snapshots_landed_offset,
                             result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
      return;
   }

   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
      /* The final snapshots happen to have landed, so let's just compute
       * the result on the CPU now...
       */
      calculate_result_on_cpu(devinfo, q);
   }

   if (q->ready) {
      /* We happen to have the result on the CPU, so just copy it. */
      if (result_type <= PIPE_QUERY_TYPE_U32) {
         ice->vtbl.store_data_imm32(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      } else {
         ice->vtbl.store_data_imm64(batch, iris_resource_bo(p_res), offset,
                                    q->result);
      }

      /* Make sure the result lands before they bind the QBO elsewhere
       * and use the result.
       */
      // XXX: Why?  i965 doesn't do this.
      iris_emit_pipe_control_flush(batch, PIPE_CONTROL_CS_STALL);
      return;
   }

   /* Calculate the result to CS_GPR0 */
   calculate_result_on_gpu(ice, q);

   bool predicated = !wait && !q->stalled;

   if (predicated) {
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
                                    snapshots_landed_offset);
      uint32_t predicate = MI_PREDICATE |
                           MI_PREDICATE_LOADOP_LOADINV |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
      iris_batch_emit(batch, &predicate, sizeof(uint32_t));
   }

   if (result_type <= PIPE_QUERY_TYPE_U32) {
      ice->vtbl.store_register_mem32(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   } else {
      ice->vtbl.store_register_mem64(batch, CS_GPR(0),
                                     iris_resource_bo(p_res),
                                     offset, predicated);
   }
}

static void
iris_set_active_query_state(struct pipe_context *ctx, boolean enable)
{
   struct iris_context *ice = (void *) ctx;

   if (ice->state.statistics_counters_enabled == enable)
      return;

   // XXX: most packets aren't paying attention to this yet, because it'd
   // have to be done dynamically at draw time, which is a pain
   ice->state.statistics_counters_enabled = enable;
   ice->state.dirty |= IRIS_DIRTY_CLIP |
                       IRIS_DIRTY_GS |
                       IRIS_DIRTY_RASTER |
                       IRIS_DIRTY_STREAMOUT |
                       IRIS_DIRTY_TCS |
                       IRIS_DIRTY_TES |
                       IRIS_DIRTY_VS |
                       IRIS_DIRTY_WM;
}

static void
set_predicate_enable(struct iris_context *ice, bool value)
{
   if (value)
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
   else
      ice->state.predicate = IRIS_PREDICATE_STATE_DONT_RENDER;
}
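
/* set_predicate_for_result() below programs MI_PREDICATE from the query
 * data: COMPAREOP_SRCS_EQUAL raises the internal flag when
 * MI_PREDICATE_SRC0 == MI_PREDICATE_SRC1, and the LOADOP (LOAD or LOADINV,
 * chosen by 'inverted') decides whether that flag or its inverse becomes
 * MI_PREDICATE_RESULT.  For occlusion queries, equal start/end snapshots
 * mean no samples passed, so LOADINV of "snapshots equal" yields a
 * "samples passed" predicate.
 */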
static void
set_predicate_for_result(struct iris_context *ice,
                         struct iris_query *q,
                         bool inverted)
{
   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   struct iris_bo *bo = iris_resource_bo(q->query_state_ref.res);

   /* The CPU doesn't have the query result yet; use hardware predication */
   ice->state.predicate = IRIS_PREDICATE_STATE_USE_BIT;

   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
   iris_emit_pipe_control_flush(batch, PIPE_CONTROL_FLUSH_ENABLE);
   q->stalled = true;

   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      overflow_result_to_gpr0(ice, q);

      ice->vtbl.load_register_reg64(batch, MI_PREDICATE_SRC0, CS_GPR(0));
      ice->vtbl.load_register_imm64(batch, MI_PREDICATE_SRC1, 0ull);
      break;
   default:
      /* PIPE_QUERY_OCCLUSION_* */
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
                                    offsetof(struct iris_query_snapshots, start) +
                                    q->query_state_ref.offset);
      ice->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
                                    offsetof(struct iris_query_snapshots, end) +
                                    q->query_state_ref.offset);
      break;
   }

   uint32_t mi_predicate = MI_PREDICATE |
                           MI_PREDICATE_COMBINEOP_SET |
                           MI_PREDICATE_COMPAREOP_SRCS_EQUAL |
                           (inverted ? MI_PREDICATE_LOADOP_LOAD
                                     : MI_PREDICATE_LOADOP_LOADINV);
   iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));

   /* We immediately set the predicate on the render batch, as all the
    * counters come from 3D operations.  However, we may need to predicate
    * a compute dispatch, which executes in a different GEM context and has
    * a different MI_PREDICATE_RESULT register.  So, we save the result to
    * memory and reload it in iris_launch_grid.
    */
   unsigned offset = q->query_state_ref.offset +
                     offsetof(struct iris_query_snapshots, predicate_result);
   ice->vtbl.store_register_mem64(batch, MI_PREDICATE_RESULT,
                                  bo, offset, false);
   ice->state.compute_predicate = bo;
}

static void
iris_render_condition(struct pipe_context *ctx,
                      struct pipe_query *query,
                      boolean condition,
                      enum pipe_render_cond_flag mode)
{
   struct iris_context *ice = (void *) ctx;
   struct iris_query *q = (void *) query;

   /* The old condition isn't relevant; we'll update it if necessary */
   ice->state.compute_predicate = NULL;
   ice->condition.query = q;
   ice->condition.condition = condition;

   if (!q) {
      ice->state.predicate = IRIS_PREDICATE_STATE_RENDER;
      return;
   }

   iris_check_query_no_flush(ice, q);

   if (q->result || q->ready) {
      set_predicate_enable(ice, (q->result != 0) ^ condition);
   } else {
      if (mode == PIPE_RENDER_COND_NO_WAIT ||
          mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
         perf_debug(&ice->dbg, "Conditional rendering demoted from "
                    "\"no wait\" to \"wait\".");
      }
      set_predicate_for_result(ice, q, condition);
   }
}

void
iris_resolve_conditional_render(struct iris_context *ice)
{
   struct pipe_context *ctx = (void *) ice;
   struct iris_query *q = ice->condition.query;
   struct pipe_query *query = (void *) q;
   union pipe_query_result result;

   if (ice->state.predicate != IRIS_PREDICATE_STATE_USE_BIT)
      return;

   assert(q);

   iris_get_query_result(ctx, query, true, &result);
   set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
}

void
iris_init_query_functions(struct pipe_context *ctx)
{
   ctx->create_query = iris_create_query;
   ctx->destroy_query = iris_destroy_query;
   ctx->begin_query = iris_begin_query;
   ctx->end_query = iris_end_query;
   ctx->get_query_result = iris_get_query_result;
   ctx->get_query_result_resource = iris_get_query_result_resource;
   ctx->set_active_query_state = iris_set_active_query_state;
   ctx->render_condition = iris_render_condition;
}