1/* 2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Jonathan Marek <jonathan@marek.ca> 25 */ 26 27#include "ir2_private.h" 28 29static unsigned 30src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp) 31{ 32 struct ir2_reg_component *comps; 33 unsigned swiz = 0; 34 35 switch (src->type) { 36 case IR2_SRC_SSA: 37 case IR2_SRC_REG: 38 break; 39 default: 40 return src->swizzle; 41 } 42 /* we need to take into account where the components were allocated */ 43 comps = get_reg_src(ctx, src)->comp; 44 for (int i = 0; i < ncomp; i++) { 45 swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i); 46 } 47 return swiz; 48} 49 50/* alu instr need to take into how the output components are allocated */ 51 52/* scalar doesn't need to take into account dest swizzle */ 53 54static unsigned 55alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg) 56{ 57 /* hardware seems to take from W, but swizzle everywhere just in case */ 58 return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX); 59} 60 61static unsigned 62alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src) 63{ 64 struct ir2_reg_component *comp = get_reg(instr)->comp; 65 unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr)); 66 unsigned swiz = 0; 67 68 /* non per component special cases */ 69 switch (instr->alu.vector_opc) { 70 case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv: 71 return alu_swizzle_scalar(ctx, src); 72 case DOT2ADDv: 73 case DOT3v: 74 case DOT4v: 75 case CUBEv: 76 return swiz0; 77 default: 78 break; 79 } 80 81 for (int i = 0, j = 0; i < dst_ncomp(instr); j++) { 82 if (instr->alu.write_mask & 1 << j) { 83 if (comp[j].c != 7) 84 swiz |= swiz_set(i, comp[j].c); 85 i++; 86 } 87 } 88 return swiz_merge(swiz0, swiz); 89} 90 91static unsigned 92alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1) 93{ 94 /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */ 95 unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0); 96 return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY); 97} 98 99/* write_mask needs to be transformed by allocation information */ 100 101static unsigned 102alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr) 103{ 104 struct ir2_reg_component *comp = get_reg(instr)->comp; 105 unsigned write_mask = 0; 106 107 for (int i = 0; i < 4; i++) { 108 if (instr->alu.write_mask & 1 << i) 109 write_mask |= 1 << comp[i].c; 110 } 111 112 return write_mask; 113} 114 115/* fetch instructions can swizzle dest, but src swizzle needs conversion */ 116 117static unsigned 118fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp) 119{ 120 unsigned alu_swiz = src_swizzle(ctx, src, ncomp); 121 unsigned swiz = 0; 122 for (int i = 0; i < ncomp; i++) 123 swiz |= swiz_get(alu_swiz, i) << i * 2; 124 return swiz; 125} 126 127static unsigned 128fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr) 129{ 130 struct ir2_reg_component *comp = get_reg(instr)->comp; 131 unsigned dst_swiz = 0xfff; 132 for (int i = 0; i < dst_ncomp(instr); i++) { 133 dst_swiz &= ~(7 << comp[i].c * 3); 134 dst_swiz |= i << comp[i].c * 3; 135 } 136 return dst_swiz; 137} 138 139/* register / export # for instr */ 140static unsigned 141dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr) 142{ 143 if (is_export(instr)) 144 return instr->alu.export; 145 146 return get_reg(instr)->idx; 147} 148 149/* register # for src */ 150static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src) 151{ 152 return get_reg_src(ctx, src)->idx; 153} 154 155static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src) 156{ 157 if (src->type == IR2_SRC_CONST) { 158 assert(!src->abs); /* no abs bit for const */ 159 return src->num; 160 } 161 return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0); 162} 163 164/* produce the 12 byte binary instruction for a given sched_instr */ 165static void 166fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, 167 instr_t *bc, bool * is_fetch) 168{ 169 struct ir2_instr *instr = sched->instr, *instr_s, *instr_v; 170 171 *bc = (instr_t) {}; 172 173 if (instr && instr->type == IR2_FETCH) { 174 *is_fetch = true; 175 176 bc->fetch.opc = instr->fetch.opc; 177 bc->fetch.pred_select = !!instr->pred; 178 bc->fetch.pred_condition = instr->pred & 1; 179 180 struct ir2_src *src = instr->src; 181 182 if (instr->fetch.opc == VTX_FETCH) { 183 instr_fetch_vtx_t *vtx = &bc->fetch.vtx; 184 185 assert(instr->fetch.vtx.const_idx <= 0x1f); 186 assert(instr->fetch.vtx.const_idx_sel <= 0x3); 187 188 vtx->src_reg = src_to_reg(ctx, src); 189 vtx->src_swiz = fetch_swizzle(ctx, src, 1); 190 vtx->dst_reg = dst_to_reg(ctx, instr); 191 vtx->dst_swiz = fetch_dst_swiz(ctx, instr); 192 193 vtx->must_be_one = 1; 194 vtx->const_index = instr->fetch.vtx.const_idx; 195 vtx->const_index_sel = instr->fetch.vtx.const_idx_sel; 196 197 /* other fields will be patched */ 198 199 /* XXX seems like every FETCH but the first has 200 * this bit set: 201 */ 202 vtx->reserved3 = instr->idx ? 0x1 : 0x0; 203 vtx->reserved0 = instr->idx ? 0x2 : 0x3; 204 } else if (instr->fetch.opc == TEX_FETCH) { 205 instr_fetch_tex_t *tex = &bc->fetch.tex; 206 207 tex->src_reg = src_to_reg(ctx, src); 208 tex->src_swiz = fetch_swizzle(ctx, src, 3); 209 tex->dst_reg = dst_to_reg(ctx, instr); 210 tex->dst_swiz = fetch_dst_swiz(ctx, instr); 211 /* tex->const_idx = patch_fetches */ 212 tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; 213 tex->min_filter = TEX_FILTER_USE_FETCH_CONST; 214 tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; 215 tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; 216 tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; 217 tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; 218 tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; 219 tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT; 220 tex->use_reg_lod = instr->src_count == 2; 221 tex->sample_location = SAMPLE_CENTER; 222 tex->tx_coord_denorm = instr->fetch.tex.is_rect; 223 } else if (instr->fetch.opc == TEX_SET_TEX_LOD) { 224 instr_fetch_tex_t *tex = &bc->fetch.tex; 225 226 tex->src_reg = src_to_reg(ctx, src); 227 tex->src_swiz = fetch_swizzle(ctx, src, 1); 228 tex->dst_reg = 0; 229 tex->dst_swiz = 0xfff; 230 231 tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; 232 tex->min_filter = TEX_FILTER_USE_FETCH_CONST; 233 tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; 234 tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; 235 tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; 236 tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; 237 tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; 238 tex->use_comp_lod = 1; 239 tex->use_reg_lod = 0; 240 tex->sample_location = SAMPLE_CENTER; 241 } else { 242 assert(0); 243 } 244 return; 245 } 246 247 instr_v = sched->instr; 248 instr_s = sched->instr_s; 249 250 if (instr_v) { 251 struct ir2_src src1, src2, *src3; 252 253 src1 = instr_v->src[0]; 254 src2 = instr_v->src[instr_v->src_count > 1]; 255 src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL; 256 257 bc->alu.vector_opc = instr_v->alu.vector_opc; 258 bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v); 259 bc->alu.vector_dest = dst_to_reg(ctx, instr_v); 260 bc->alu.vector_clamp = instr_v->alu.saturate; 261 bc->alu.export_data = instr_v->alu.export >= 0; 262 263 /* single operand SETEv, use 0.0f as src2 */ 264 if (instr_v->src_count == 1 && 265 (bc->alu.vector_opc == SETEv || 266 bc->alu.vector_opc == SETNEv || 267 bc->alu.vector_opc == SETGTv || 268 bc->alu.vector_opc == SETGTEv)) 269 src2 = ir2_zero(ctx); 270 271 /* export32 instr for a20x hw binning has this bit set.. 272 * it seems to do more than change the base address of constants 273 * XXX this is a hack 274 */ 275 bc->alu.relative_addr = 276 (bc->alu.export_data && bc->alu.vector_dest == 32); 277 278 bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1); 279 bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1); 280 bc->alu.src1_reg_negate = src1.negate; 281 bc->alu.src1_sel = src1.type != IR2_SRC_CONST; 282 283 bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2); 284 bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2); 285 bc->alu.src2_reg_negate = src2.negate; 286 bc->alu.src2_sel = src2.type != IR2_SRC_CONST; 287 288 if (src3) { 289 bc->alu.src3_reg_byte = src_reg_byte(ctx, src3); 290 bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3); 291 bc->alu.src3_reg_negate = src3->negate; 292 bc->alu.src3_sel = src3->type != IR2_SRC_CONST; 293 } 294 295 bc->alu.pred_select = instr_v->pred; 296 } 297 298 if (instr_s) { 299 struct ir2_src *src = instr_s->src; 300 301 bc->alu.scalar_opc = instr_s->alu.scalar_opc; 302 bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s); 303 bc->alu.scalar_dest = dst_to_reg(ctx, instr_s); 304 bc->alu.scalar_clamp = instr_s->alu.saturate; 305 bc->alu.export_data = instr_s->alu.export >= 0; 306 307 if (instr_s->src_count == 1) { 308 bc->alu.src3_reg_byte = src_reg_byte(ctx, src); 309 bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src); 310 bc->alu.src3_reg_negate = src->negate; 311 bc->alu.src3_sel = src->type != IR2_SRC_CONST; 312 } else { 313 assert(instr_s->src_count == 2); 314 315 bc->alu.src3_reg_byte = src_reg_byte(ctx, src); 316 bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle); 317 bc->alu.src3_reg_negate = src->negate; 318 bc->alu.src3_sel = src->type != IR2_SRC_CONST;; 319 } 320 321 if (instr_v) 322 assert(instr_s->pred == instr_v->pred); 323 bc->alu.pred_select = instr_s->pred; 324 } 325 326 *is_fetch = false; 327 return; 328} 329 330static unsigned 331write_cfs(struct ir2_context *ctx, instr_cf_t * cfs, unsigned cf_idx, 332 instr_cf_alloc_t *alloc, instr_cf_exec_t *exec) 333{ 334 assert(exec->count); 335 336 if (alloc) 337 cfs[cf_idx++].alloc = *alloc; 338 339 /* for memory alloc offset for patching */ 340 if (alloc && alloc->buffer_select == SQ_MEMORY && 341 ctx->info->mem_export_ptr == -1) 342 ctx->info->mem_export_ptr = cf_idx / 2 * 3; 343 344 cfs[cf_idx++].exec = *exec; 345 exec->address += exec->count; 346 exec->serialize = 0; 347 exec->count = 0; 348 349 return cf_idx; 350} 351 352/* assemble the final shader */ 353void assemble(struct ir2_context *ctx, bool binning) 354{ 355 /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384) 356 * address is 9 bits so could it be 512 ? 357 */ 358 instr_cf_t cfs[384]; 359 instr_t bytecode[384], bc; 360 unsigned block_addr[128]; 361 unsigned num_cf = 0; 362 363 /* CF instr state */ 364 instr_cf_exec_t exec = {.opc = EXEC}; 365 instr_cf_alloc_t alloc = {.opc = ALLOC}; 366 367 int sync_id, sync_id_prev = -1; 368 bool is_fetch = false; 369 bool need_sync = true; 370 bool need_alloc = false; 371 unsigned block_idx = 0; 372 373 ctx->info->mem_export_ptr = -1; 374 ctx->info->num_fetch_instrs = 0; 375 376 /* vertex shader always needs to allocate at least one parameter 377 * if it will never happen, 378 */ 379 if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) { 380 alloc.buffer_select = SQ_PARAMETER_PIXEL; 381 cfs[num_cf++].alloc = alloc; 382 } 383 384 block_addr[0] = 0; 385 386 for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) { 387 struct ir2_instr *instr = ctx->instr_sched[j].instr; 388 389 /* catch IR2_CF since it isn't a regular instruction */ 390 if (instr && instr->type == IR2_CF) { 391 assert(!need_alloc); /* XXX */ 392 393 /* flush any exec cf before inserting jmp */ 394 if (exec.count) 395 num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec); 396 397 cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) { 398 .opc = COND_JMP, 399 .address = instr->cf.block_idx, /* will be fixed later */ 400 .force_call = !instr->pred, 401 .predicated_jmp = 1, 402 .direction = instr->cf.block_idx > instr->block_idx, 403 .condition = instr->pred & 1, 404 }; 405 continue; 406 } 407 408 /* fill the 3 dwords for the instruction */ 409 fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch); 410 411 /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */ 412 sync_id = 0; 413 if (is_fetch) 414 sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2; 415 416 need_sync = sync_id != sync_id_prev; 417 sync_id_prev = sync_id; 418 419 unsigned block; 420 { 421 422 if (ctx->instr_sched[j].instr) 423 block = ctx->instr_sched[j].instr->block_idx; 424 else 425 block = ctx->instr_sched[j].instr_s->block_idx; 426 427 assert(block_idx <= block); 428 } 429 430 /* info for patching */ 431 if (is_fetch) { 432 struct ir2_fetch_info *info = 433 &ctx->info->fetch_info[ctx->info->num_fetch_instrs++]; 434 info->offset = i * 3; /* add cf offset later */ 435 436 if (bc.fetch.opc == VTX_FETCH) { 437 info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz; 438 } else if (bc.fetch.opc == TEX_FETCH) { 439 info->tex.samp_id = instr->fetch.tex.samp_id; 440 info->tex.src_swiz = bc.fetch.tex.src_swiz; 441 } else { 442 ctx->info->num_fetch_instrs--; 443 } 444 } 445 446 /* exec cf after 6 instr or when switching between fetch / alu */ 447 if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) { 448 num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); 449 need_alloc = false; 450 } 451 452 /* update block_addrs for jmp patching */ 453 while (block_idx < block) 454 block_addr[++block_idx] = num_cf; 455 456 /* export - fill alloc cf */ 457 if (!is_fetch && bc.alu.export_data) { 458 /* get the export buffer from either vector/scalar dest */ 459 instr_alloc_type_t buffer = 460 export_buf(bc.alu.vector_dest); 461 if (bc.alu.scalar_write_mask) { 462 if (bc.alu.vector_write_mask) 463 assert(buffer == export_buf(bc.alu.scalar_dest)); 464 buffer = export_buf(bc.alu.scalar_dest); 465 } 466 467 /* flush previous alloc if the buffer changes */ 468 bool need_new_alloc = buffer != alloc.buffer_select; 469 470 /* memory export always in 32/33 pair, new alloc on 32 */ 471 if (bc.alu.vector_dest == 32) 472 need_new_alloc = true; 473 474 if (need_new_alloc && exec.count) { 475 num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); 476 need_alloc = false; 477 } 478 479 need_alloc |= need_new_alloc; 480 481 alloc.size = 0; 482 alloc.buffer_select = buffer; 483 484 if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == MESA_SHADER_VERTEX) 485 alloc.size = ctx->f->inputs_count - 1; 486 487 if (buffer == SQ_POSITION) 488 alloc.size = ctx->so->writes_psize; 489 } 490 491 if (is_fetch) 492 exec.serialize |= 0x1 << exec.count * 2; 493 if (need_sync) 494 exec.serialize |= 0x2 << exec.count * 2; 495 496 need_sync = false; 497 exec.count += 1; 498 bytecode[i++] = bc; 499 } 500 501 /* final exec cf */ 502 exec.opc = EXEC_END; 503 num_cf = 504 write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); 505 506 /* insert nop to get an even # of CFs */ 507 if (num_cf % 2) 508 cfs[num_cf++] = (instr_cf_t) { 509 .opc = NOP}; 510 511 /* patch cf addrs */ 512 for (int idx = 0; idx < num_cf; idx++) { 513 switch (cfs[idx].opc) { 514 case NOP: 515 case ALLOC: 516 break; 517 case EXEC: 518 case EXEC_END: 519 cfs[idx].exec.address += num_cf / 2; 520 break; 521 case COND_JMP: 522 cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address]; 523 break; 524 default: 525 assert(0); 526 } 527 } 528 529 /* concatenate cfs and alu/fetch */ 530 uint32_t cfdwords = num_cf / 2 * 3; 531 uint32_t alufetchdwords = exec.address * 3; 532 uint32_t sizedwords = cfdwords + alufetchdwords; 533 uint32_t *dwords = malloc(sizedwords * 4); 534 assert(dwords); 535 memcpy(dwords, cfs, cfdwords * 4); 536 memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4); 537 538 /* finalize ir2_shader_info */ 539 ctx->info->dwords = dwords; 540 ctx->info->sizedwords = sizedwords; 541 for (int i = 0; i < ctx->info->num_fetch_instrs; i++) 542 ctx->info->fetch_info[i].offset += cfdwords; 543 544 if (fd_mesa_debug & FD_DBG_DISASM) { 545 DBG("disassemble: type=%d", ctx->so->type); 546 disasm_a2xx(dwords, sizedwords, 0, ctx->so->type); 547 } 548} 549