1/* 2 * Copyright (C) 2017-2018 Rob Clark <robclark@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Rob Clark <robclark@freedesktop.org> 25 */ 26 27#define GPU 600 28 29#include "ir3_context.h" 30#include "ir3_image.h" 31 32/* 33 * Handlers for instructions changed/added in a6xx: 34 * 35 * Starting with a6xx, isam and stbi is used for SSBOs as well; stbi and the 36 * atomic instructions (used for both SSBO and image) use a new instruction 37 * encoding compared to a4xx/a5xx. 38 */ 39 40/* src[] = { buffer_index, offset }. No const_index */ 41static void 42emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, 43 struct ir3_instruction **dst) 44{ 45 struct ir3_block *b = ctx->block; 46 struct ir3_instruction *offset; 47 struct ir3_instruction *ldib; 48 49 offset = ir3_get_src(ctx, &intr->src[2])[0]; 50 51 ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0); 52 ldib->dsts[0]->wrmask = MASK(intr->num_components); 53 ldib->cat6.iim_val = intr->num_components; 54 ldib->cat6.d = 1; 55 ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32; 56 ldib->barrier_class = IR3_BARRIER_BUFFER_R; 57 ldib->barrier_conflict = IR3_BARRIER_BUFFER_W; 58 ir3_handle_bindless_cat6(ldib, intr->src[0]); 59 ir3_handle_nonuniform(ldib, intr); 60 61 ir3_split_dest(b, dst, ldib, 0, intr->num_components); 62} 63 64/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ 65static void 66emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) 67{ 68 struct ir3_block *b = ctx->block; 69 struct ir3_instruction *stib, *val, *offset; 70 unsigned wrmask = nir_intrinsic_write_mask(intr); 71 unsigned ncomp = ffs(~wrmask) - 1; 72 73 assert(wrmask == BITFIELD_MASK(intr->num_components)); 74 75 /* src0 is offset, src1 is value: 76 */ 77 val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); 78 offset = ir3_get_src(ctx, &intr->src[3])[0]; 79 80 stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0); 81 stib->cat6.iim_val = ncomp; 82 stib->cat6.d = 1; 83 stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32; 84 stib->barrier_class = IR3_BARRIER_BUFFER_W; 85 stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; 86 ir3_handle_bindless_cat6(stib, intr->src[1]); 87 ir3_handle_nonuniform(stib, intr); 88 89 array_insert(b, b->keeps, stib); 90} 91 92/* 93 * SSBO atomic intrinsics 94 * 95 * All of the SSBO atomic memory operations read a value from memory, 96 * compute a new value using one of the operations below, write the new 97 * value to memory, and return the original value read. 98 * 99 * All operations take 3 sources except CompSwap that takes 4. These 100 * sources represent: 101 * 102 * 0: The SSBO buffer index. 103 * 1: The offset into the SSBO buffer of the variable that the atomic 104 * operation will operate on. 105 * 2: The data parameter to the atomic function (i.e. the value to add 106 * in ssbo_atomic_add, etc). 107 * 3: For CompSwap only: the second data parameter. 108 */ 109static struct ir3_instruction * 110emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) 111{ 112 struct ir3_block *b = ctx->block; 113 struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy; 114 type_t type = TYPE_U32; 115 116 ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]); 117 118 data = ir3_get_src(ctx, &intr->src[2])[0]; 119 120 /* So this gets a bit creative: 121 * 122 * src0 - vecN offset/coords 123 * src1.x - is actually destination register 124 * src1.y - is 'data' except for cmpxchg where src2.y is 'compare' 125 * src1.z - is 'data' for cmpxchg 126 * 127 * The combining src and dest kinda doesn't work out so well with how 128 * scheduling and RA work. So we create a dummy src2 which is tied to the 129 * destination in RA (i.e. must be allocated to the same vec2/vec3 130 * register) and then immediately extract the first component. 131 * 132 * Note that nir already multiplies the offset by four 133 */ 134 dummy = create_immed(b, 0); 135 136 if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) { 137 src0 = ir3_get_src(ctx, &intr->src[4])[0]; 138 struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0]; 139 src1 = ir3_collect(b, dummy, compare, data); 140 } else { 141 src0 = ir3_get_src(ctx, &intr->src[3])[0]; 142 src1 = ir3_collect(b, dummy, data); 143 } 144 145 switch (intr->intrinsic) { 146 case nir_intrinsic_ssbo_atomic_add_ir3: 147 atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0); 148 break; 149 case nir_intrinsic_ssbo_atomic_imin_ir3: 150 atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0); 151 type = TYPE_S32; 152 break; 153 case nir_intrinsic_ssbo_atomic_umin_ir3: 154 atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0); 155 break; 156 case nir_intrinsic_ssbo_atomic_imax_ir3: 157 atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0); 158 type = TYPE_S32; 159 break; 160 case nir_intrinsic_ssbo_atomic_umax_ir3: 161 atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0); 162 break; 163 case nir_intrinsic_ssbo_atomic_and_ir3: 164 atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0); 165 break; 166 case nir_intrinsic_ssbo_atomic_or_ir3: 167 atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0); 168 break; 169 case nir_intrinsic_ssbo_atomic_xor_ir3: 170 atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0); 171 break; 172 case nir_intrinsic_ssbo_atomic_exchange_ir3: 173 atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0); 174 break; 175 case nir_intrinsic_ssbo_atomic_comp_swap_ir3: 176 atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0); 177 break; 178 default: 179 unreachable("boo"); 180 } 181 182 atomic->cat6.iim_val = 1; 183 atomic->cat6.d = 1; 184 atomic->cat6.type = type; 185 atomic->barrier_class = IR3_BARRIER_BUFFER_W; 186 atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; 187 ir3_handle_bindless_cat6(atomic, intr->src[0]); 188 189 /* even if nothing consume the result, we can't DCE the instruction: */ 190 array_insert(b, b->keeps, atomic); 191 192 atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask; 193 ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]); 194 struct ir3_instruction *split; 195 ir3_split_dest(b, &split, atomic, 0, 1); 196 return split; 197} 198 199/* src[] = { deref, coord, sample_index }. const_index[] = {} */ 200static void 201emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, 202 struct ir3_instruction **dst) 203{ 204 struct ir3_block *b = ctx->block; 205 struct ir3_instruction *ldib; 206 struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]); 207 unsigned ncoords = ir3_get_image_coords(intr, NULL); 208 209 ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0, 210 ir3_create_collect(b, coords, ncoords), 0); 211 ldib->dsts[0]->wrmask = MASK(intr->num_components); 212 ldib->cat6.iim_val = intr->num_components; 213 ldib->cat6.d = ncoords; 214 ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr); 215 ldib->cat6.typed = true; 216 ldib->barrier_class = IR3_BARRIER_IMAGE_R; 217 ldib->barrier_conflict = IR3_BARRIER_IMAGE_W; 218 ir3_handle_bindless_cat6(ldib, intr->src[0]); 219 ir3_handle_nonuniform(ldib, intr); 220 221 ir3_split_dest(b, dst, ldib, 0, intr->num_components); 222} 223 224/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */ 225static void 226emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) 227{ 228 struct ir3_block *b = ctx->block; 229 struct ir3_instruction *stib; 230 struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]); 231 struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]); 232 unsigned ncoords = ir3_get_image_coords(intr, NULL); 233 enum pipe_format format = nir_intrinsic_format(intr); 234 unsigned ncomp = ir3_get_num_components_for_image_format(format); 235 236 /* src0 is offset, src1 is value: 237 */ 238 stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0, 239 ir3_create_collect(b, coords, ncoords), 0, 240 ir3_create_collect(b, value, ncomp), 0); 241 stib->cat6.iim_val = ncomp; 242 stib->cat6.d = ncoords; 243 stib->cat6.type = ir3_get_type_for_image_intrinsic(intr); 244 stib->cat6.typed = true; 245 stib->barrier_class = IR3_BARRIER_IMAGE_W; 246 stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; 247 ir3_handle_bindless_cat6(stib, intr->src[0]); 248 ir3_handle_nonuniform(stib, intr); 249 250 array_insert(b, b->keeps, stib); 251} 252 253/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */ 254static struct ir3_instruction * 255emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) 256{ 257 struct ir3_block *b = ctx->block; 258 struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy; 259 struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]); 260 struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0]; 261 unsigned ncoords = ir3_get_image_coords(intr, NULL); 262 263 ibo = ir3_image_to_ibo(ctx, intr->src[0]); 264 265 /* So this gets a bit creative: 266 * 267 * src0 - vecN offset/coords 268 * src1.x - is actually destination register 269 * src1.y - is 'value' except for cmpxchg where src2.y is 'compare' 270 * src1.z - is 'value' for cmpxchg 271 * 272 * The combining src and dest kinda doesn't work out so well with how 273 * scheduling and RA work. So we create a dummy src2 which is tied to the 274 * destination in RA (i.e. must be allocated to the same vec2/vec3 275 * register) and then immediately extract the first component. 276 */ 277 dummy = create_immed(b, 0); 278 src0 = ir3_create_collect(b, coords, ncoords); 279 280 if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap || 281 intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) { 282 struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0]; 283 src1 = ir3_collect(b, dummy, compare, value); 284 } else { 285 src1 = ir3_collect(b, dummy, value); 286 } 287 288 switch (intr->intrinsic) { 289 case nir_intrinsic_image_atomic_add: 290 case nir_intrinsic_bindless_image_atomic_add: 291 atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0); 292 break; 293 case nir_intrinsic_image_atomic_imin: 294 case nir_intrinsic_image_atomic_umin: 295 case nir_intrinsic_bindless_image_atomic_imin: 296 case nir_intrinsic_bindless_image_atomic_umin: 297 atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0); 298 break; 299 case nir_intrinsic_image_atomic_imax: 300 case nir_intrinsic_image_atomic_umax: 301 case nir_intrinsic_bindless_image_atomic_imax: 302 case nir_intrinsic_bindless_image_atomic_umax: 303 atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0); 304 break; 305 case nir_intrinsic_image_atomic_and: 306 case nir_intrinsic_bindless_image_atomic_and: 307 atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0); 308 break; 309 case nir_intrinsic_image_atomic_or: 310 case nir_intrinsic_bindless_image_atomic_or: 311 atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0); 312 break; 313 case nir_intrinsic_image_atomic_xor: 314 case nir_intrinsic_bindless_image_atomic_xor: 315 atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0); 316 break; 317 case nir_intrinsic_image_atomic_exchange: 318 case nir_intrinsic_bindless_image_atomic_exchange: 319 atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0); 320 break; 321 case nir_intrinsic_image_atomic_comp_swap: 322 case nir_intrinsic_bindless_image_atomic_comp_swap: 323 atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0); 324 break; 325 default: 326 unreachable("boo"); 327 } 328 329 atomic->cat6.iim_val = 1; 330 atomic->cat6.d = ncoords; 331 atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr); 332 atomic->cat6.typed = true; 333 atomic->barrier_class = IR3_BARRIER_IMAGE_W; 334 atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W; 335 ir3_handle_bindless_cat6(atomic, intr->src[0]); 336 337 /* even if nothing consume the result, we can't DCE the instruction: */ 338 array_insert(b, b->keeps, atomic); 339 340 atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask; 341 ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]); 342 struct ir3_instruction *split; 343 ir3_split_dest(b, &split, atomic, 0, 1); 344 return split; 345} 346 347static void 348emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, 349 struct ir3_instruction **dst) 350{ 351 struct ir3_block *b = ctx->block; 352 struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]); 353 struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0); 354 resinfo->cat6.iim_val = 1; 355 resinfo->cat6.d = intr->num_components; 356 resinfo->cat6.type = TYPE_U32; 357 resinfo->cat6.typed = false; 358 /* resinfo has no writemask and always writes out 3 components: */ 359 compile_assert(ctx, intr->num_components <= 3); 360 resinfo->dsts[0]->wrmask = MASK(3); 361 ir3_handle_bindless_cat6(resinfo, intr->src[0]); 362 ir3_handle_nonuniform(resinfo, intr); 363 364 ir3_split_dest(b, dst, resinfo, 0, intr->num_components); 365} 366 367static void 368emit_intrinsic_load_global_ir3(struct ir3_context *ctx, 369 nir_intrinsic_instr *intr, 370 struct ir3_instruction **dst) 371{ 372 struct ir3_block *b = ctx->block; 373 unsigned dest_components = nir_intrinsic_dest_components(intr); 374 struct ir3_instruction *addr, *offset; 375 376 addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0], 377 ir3_get_src(ctx, &intr->src[0])[1]); 378 379 offset = ir3_get_src(ctx, &intr->src[1])[0]; 380 381 struct ir3_instruction *load = 382 ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0, 383 create_immed(b, 0), 0, create_immed(b, dest_components), 0); 384 load->cat6.type = TYPE_U32; 385 load->dsts[0]->wrmask = MASK(dest_components); 386 387 load->barrier_class = IR3_BARRIER_BUFFER_R; 388 load->barrier_conflict = IR3_BARRIER_BUFFER_W; 389 390 ir3_split_dest(b, dst, load, 0, dest_components); 391} 392 393static void 394emit_intrinsic_store_global_ir3(struct ir3_context *ctx, 395 nir_intrinsic_instr *intr) 396{ 397 struct ir3_block *b = ctx->block; 398 struct ir3_instruction *value, *addr, *offset; 399 unsigned ncomp = nir_intrinsic_src_components(intr, 0); 400 401 addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0], 402 ir3_get_src(ctx, &intr->src[1])[1]); 403 404 offset = ir3_get_src(ctx, &intr->src[2])[0]; 405 406 value = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); 407 408 struct ir3_instruction *stg = 409 ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0, 410 create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0); 411 stg->cat6.type = TYPE_U32; 412 stg->cat6.iim_val = 1; 413 414 array_insert(b, b->keeps, stg); 415 416 stg->barrier_class = IR3_BARRIER_BUFFER_W; 417 stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W; 418} 419 420const struct ir3_context_funcs ir3_a6xx_funcs = { 421 .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo, 422 .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo, 423 .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo, 424 .emit_intrinsic_load_image = emit_intrinsic_load_image, 425 .emit_intrinsic_store_image = emit_intrinsic_store_image, 426 .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image, 427 .emit_intrinsic_image_size = emit_intrinsic_image_size, 428 .emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3, 429 .emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3, 430}; 431