17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2020 Valve Corporation 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 217ec681f3Smrg * IN THE SOFTWARE. 227ec681f3Smrg * 237ec681f3Smrg */ 247ec681f3Smrg#include "helpers.h" 257ec681f3Smrg 267ec681f3Smrgusing namespace aco; 277ec681f3Smrg 287ec681f3SmrgBEGIN_TEST(optimize.neg) 297ec681f3Smrg for (unsigned i = GFX9; i <= GFX10; i++) { 307ec681f3Smrg //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm 317ec681f3Smrg if (!setup_cs("v1 v1 s1 s1", (chip_class)i)) 327ec681f3Smrg continue; 337ec681f3Smrg 347ec681f3Smrg //! v1: %res0 = v_mul_f32 %a, -%b 357ec681f3Smrg //! p_unit_test 0, %res0 367ec681f3Smrg Temp neg_b = fneg(inputs[1]); 377ec681f3Smrg writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b)); 387ec681f3Smrg 397ec681f3Smrg //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a 407ec681f3Smrg //~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a 417ec681f3Smrg //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a 427ec681f3Smrg //! p_unit_test 1, %res1 437ec681f3Smrg Temp neg_a = fneg(inputs[0]); 447ec681f3Smrg writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a)); 457ec681f3Smrg 467ec681f3Smrg //! v1: %res2 = v_mul_f32 %a, %b 477ec681f3Smrg //! p_unit_test 2, %res2 487ec681f3Smrg Temp neg_neg_a = fneg(neg_a); 497ec681f3Smrg writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1])); 507ec681f3Smrg 517ec681f3Smrg //! v1: %res3 = v_mul_f32 |%a|, %b 527ec681f3Smrg //! p_unit_test 3, %res3 537ec681f3Smrg Temp abs_neg_a = fabs(neg_a); 547ec681f3Smrg writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1])); 557ec681f3Smrg 567ec681f3Smrg //! v1: %res4 = v_mul_f32 -|%a|, %b 577ec681f3Smrg //! p_unit_test 4, %res4 587ec681f3Smrg Temp abs_a = fabs(inputs[0]); 597ec681f3Smrg Temp neg_abs_a = fneg(abs_a); 607ec681f3Smrg writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1])); 617ec681f3Smrg 627ec681f3Smrg //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 637ec681f3Smrg //! p_unit_test 5, %res5 647ec681f3Smrg writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1))); 657ec681f3Smrg 667ec681f3Smrg //! v1: %res6 = v_subrev_f32 %a, %b 677ec681f3Smrg //! p_unit_test 6, %res6 687ec681f3Smrg writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1])); 697ec681f3Smrg 707ec681f3Smrg //! v1: %res7 = v_sub_f32 %b, %a 717ec681f3Smrg //! p_unit_test 7, %res7 727ec681f3Smrg writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a)); 737ec681f3Smrg 747ec681f3Smrg //! v1: %res8 = v_mul_f32 %a, -%c 757ec681f3Smrg //! p_unit_test 8, %res8 767ec681f3Smrg Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2])); 777ec681f3Smrg writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c)); 787ec681f3Smrg 797ec681f3Smrg // //! v1: %res9 = v_mul_f32 |%neg_a|, %b 807ec681f3Smrg // //! p_unit_test 9, %res9 817ec681f3Smrg Temp abs_neg_abs_a = fabs(neg_abs_a); 827ec681f3Smrg writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1])); 837ec681f3Smrg 847ec681f3Smrg finish_opt_test(); 857ec681f3Smrg } 867ec681f3SmrgEND_TEST 877ec681f3Smrg 887ec681f3SmrgBEGIN_TEST(optimize.output_modifiers) 897ec681f3Smrg //>> v1: %a, v1: %b = p_startpgm 907ec681f3Smrg if (!setup_cs("v1 v1", GFX9)) 917ec681f3Smrg return; 927ec681f3Smrg 937ec681f3Smrg program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; 947ec681f3Smrg 957ec681f3Smrg /* 32-bit modifiers */ 967ec681f3Smrg 977ec681f3Smrg //! v1: %res0 = v_add_f32 %a, %b *0.5 987ec681f3Smrg //! p_unit_test 0, %res0 997ec681f3Smrg Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1007ec681f3Smrg writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp)); 1017ec681f3Smrg 1027ec681f3Smrg //! v1: %res1 = v_add_f32 %a, %b *2 1037ec681f3Smrg //! p_unit_test 1, %res1 1047ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1057ec681f3Smrg writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp)); 1067ec681f3Smrg 1077ec681f3Smrg //! v1: %res2 = v_add_f32 %a, %b *4 1087ec681f3Smrg //! p_unit_test 2, %res2 1097ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1107ec681f3Smrg writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp)); 1117ec681f3Smrg 1127ec681f3Smrg //! v1: %res3 = v_add_f32 %a, %b clamp 1137ec681f3Smrg //! p_unit_test 3, %res3 1147ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1157ec681f3Smrg writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), 1167ec681f3Smrg Operand::c32(0x3f800000u), tmp)); 1177ec681f3Smrg 1187ec681f3Smrg //! v1: %res4 = v_add_f32 %a, %b *2 clamp 1197ec681f3Smrg //! p_unit_test 4, %res4 1207ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1217ec681f3Smrg tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp); 1227ec681f3Smrg writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), 1237ec681f3Smrg Operand::c32(0x3f800000u), tmp)); 1247ec681f3Smrg 1257ec681f3Smrg /* 16-bit modifiers */ 1267ec681f3Smrg 1277ec681f3Smrg //! v2b: %res5 = v_add_f16 %a, %b *0.5 1287ec681f3Smrg //! p_unit_test 5, %res5 1297ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 1307ec681f3Smrg writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp)); 1317ec681f3Smrg 1327ec681f3Smrg //! v2b: %res6 = v_add_f16 %a, %b *2 1337ec681f3Smrg //! p_unit_test 6, %res6 1347ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 1357ec681f3Smrg writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp)); 1367ec681f3Smrg 1377ec681f3Smrg //! v2b: %res7 = v_add_f16 %a, %b *4 1387ec681f3Smrg //! p_unit_test 7, %res7 1397ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 1407ec681f3Smrg writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp)); 1417ec681f3Smrg 1427ec681f3Smrg //! v2b: %res8 = v_add_f16 %a, %b clamp 1437ec681f3Smrg //! p_unit_test 8, %res8 1447ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 1457ec681f3Smrg writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u), 1467ec681f3Smrg Operand::c16(0x3c00u), tmp)); 1477ec681f3Smrg 1487ec681f3Smrg //! v2b: %res9 = v_add_f16 %a, %b *2 clamp 1497ec681f3Smrg //! p_unit_test 9, %res9 1507ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 1517ec681f3Smrg tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp); 1527ec681f3Smrg writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u), 1537ec681f3Smrg Operand::c16(0x3c00u), tmp)); 1547ec681f3Smrg 1557ec681f3Smrg /* clamping is done after omod */ 1567ec681f3Smrg 1577ec681f3Smrg //! v1: %res10_tmp = v_add_f32 %a, %b clamp 1587ec681f3Smrg //! v1: %res10 = v_mul_f32 2.0, %res10_tmp 1597ec681f3Smrg //! p_unit_test 10, %res10 1607ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1617ec681f3Smrg tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u), 1627ec681f3Smrg tmp); 1637ec681f3Smrg writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp)); 1647ec681f3Smrg 1657ec681f3Smrg /* unsupported instructions */ 1667ec681f3Smrg 1677ec681f3Smrg //! v1: %res11_tmp = v_xor_b32 %a, %b 1687ec681f3Smrg //! v1: %res11 = v_mul_f32 2.0, %res11_tmp 1697ec681f3Smrg //! p_unit_test 11, %res11 1707ec681f3Smrg tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]); 1717ec681f3Smrg writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp)); 1727ec681f3Smrg 1737ec681f3Smrg /* several users */ 1747ec681f3Smrg 1757ec681f3Smrg //! v1: %res12_tmp = v_add_f32 %a, %b 1767ec681f3Smrg //! p_unit_test %res12_tmp 1777ec681f3Smrg //! v1: %res12 = v_mul_f32 2.0, %res12_tmp 1787ec681f3Smrg //! p_unit_test 12, %res12 1797ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1807ec681f3Smrg bld.pseudo(aco_opcode::p_unit_test, tmp); 1817ec681f3Smrg writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp)); 1827ec681f3Smrg 1837ec681f3Smrg //! v1: %res13 = v_add_f32 %a, %b 1847ec681f3Smrg //! p_unit_test 13, %res13 1857ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 1867ec681f3Smrg bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp); 1877ec681f3Smrg writeout(13, tmp); 1887ec681f3Smrg 1897ec681f3Smrg /* omod has no effect if denormals are enabled but clamp is fine */ 1907ec681f3Smrg 1917ec681f3Smrg //>> BB1 1927ec681f3Smrg //! /* logical preds: / linear preds: / kind: uniform, */ 1937ec681f3Smrg program->next_fp_mode.denorm32 = fp_denorm_keep; 1947ec681f3Smrg program->next_fp_mode.denorm16_64 = fp_denorm_flush; 1957ec681f3Smrg bld.reset(program->create_and_insert_block()); 1967ec681f3Smrg 1977ec681f3Smrg //! v1: %res14_tmp = v_add_f32 %a, %b 1987ec681f3Smrg //! v1: %res14 = v_mul_f32 2.0, %res13_tmp 1997ec681f3Smrg //! p_unit_test 14, %res14 2007ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 2017ec681f3Smrg writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp)); 2027ec681f3Smrg 2037ec681f3Smrg //! v1: %res15 = v_add_f32 %a, %b clamp 2047ec681f3Smrg //! p_unit_test 15, %res15 2057ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 2067ec681f3Smrg writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), 2077ec681f3Smrg Operand::c32(0x3f800000u), tmp)); 2087ec681f3Smrg 2097ec681f3Smrg //>> BB2 2107ec681f3Smrg //! /* logical preds: / linear preds: / kind: uniform, */ 2117ec681f3Smrg program->next_fp_mode.denorm32 = fp_denorm_flush; 2127ec681f3Smrg program->next_fp_mode.denorm16_64 = fp_denorm_keep; 2137ec681f3Smrg bld.reset(program->create_and_insert_block()); 2147ec681f3Smrg 2157ec681f3Smrg //! v2b: %res16_tmp = v_add_f16 %a, %b 2167ec681f3Smrg //! v2b: %res16 = v_mul_f16 2.0, %res15_tmp 2177ec681f3Smrg //! p_unit_test 16, %res16 2187ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 2197ec681f3Smrg writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp)); 2207ec681f3Smrg 2217ec681f3Smrg //! v2b: %res17 = v_add_f16 %a, %b clamp 2227ec681f3Smrg //! p_unit_test 17, %res17 2237ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 2247ec681f3Smrg writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u), 2257ec681f3Smrg Operand::c16(0x3c00u), tmp)); 2267ec681f3Smrg 2277ec681f3Smrg /* omod flushes -0.0 to +0.0 */ 2287ec681f3Smrg 2297ec681f3Smrg //>> BB3 2307ec681f3Smrg //! /* logical preds: / linear preds: / kind: uniform, */ 2317ec681f3Smrg program->next_fp_mode.denorm32 = fp_denorm_keep; 2327ec681f3Smrg program->next_fp_mode.denorm16_64 = fp_denorm_keep; 2337ec681f3Smrg program->next_fp_mode.preserve_signed_zero_inf_nan32 = true; 2347ec681f3Smrg program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false; 2357ec681f3Smrg bld.reset(program->create_and_insert_block()); 2367ec681f3Smrg 2377ec681f3Smrg //! v1: %res18_tmp = v_add_f32 %a, %b 2387ec681f3Smrg //! v1: %res18 = v_mul_f32 2.0, %res18_tmp 2397ec681f3Smrg //! p_unit_test 18, %res18 2407ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 2417ec681f3Smrg writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp)); 2427ec681f3Smrg //! v1: %res19 = v_add_f32 %a, %b clamp 2437ec681f3Smrg //! p_unit_test 19, %res19 2447ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]); 2457ec681f3Smrg writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), 2467ec681f3Smrg Operand::c32(0x3f800000u), tmp)); 2477ec681f3Smrg 2487ec681f3Smrg //>> BB4 2497ec681f3Smrg //! /* logical preds: / linear preds: / kind: uniform, */ 2507ec681f3Smrg program->next_fp_mode.preserve_signed_zero_inf_nan32 = false; 2517ec681f3Smrg program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true; 2527ec681f3Smrg bld.reset(program->create_and_insert_block()); 2537ec681f3Smrg //! v2b: %res20_tmp = v_add_f16 %a, %b 2547ec681f3Smrg //! v2b: %res20 = v_mul_f16 2.0, %res20_tmp 2557ec681f3Smrg //! p_unit_test 20, %res20 2567ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 2577ec681f3Smrg writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp)); 2587ec681f3Smrg //! v2b: %res21 = v_add_f16 %a, %b clamp 2597ec681f3Smrg //! p_unit_test 21, %res21 2607ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]); 2617ec681f3Smrg writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u), 2627ec681f3Smrg Operand::c16(0x3c00u), tmp)); 2637ec681f3Smrg 2647ec681f3Smrg finish_opt_test(); 2657ec681f3SmrgEND_TEST 2667ec681f3Smrg 2677ec681f3SmrgTemp create_subbrev_co(Operand op0, Operand op1, Operand op2) 2687ec681f3Smrg{ 2697ec681f3Smrg return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2); 2707ec681f3Smrg} 2717ec681f3Smrg 2727ec681f3SmrgBEGIN_TEST(optimize.cndmask) 2737ec681f3Smrg for (unsigned i = GFX9; i <= GFX10; i++) { 2747ec681f3Smrg //>> v1: %a, s1: %b, s2: %c = p_startpgm 2757ec681f3Smrg if (!setup_cs("v1 s1 s2", (chip_class)i)) 2767ec681f3Smrg continue; 2777ec681f3Smrg 2787ec681f3Smrg Temp subbrev; 2797ec681f3Smrg 2807ec681f3Smrg //! v1: %res0 = v_cndmask_b32 0, %a, %c 2817ec681f3Smrg //! p_unit_test 0, %res0 2827ec681f3Smrg subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2])); 2837ec681f3Smrg writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev)); 2847ec681f3Smrg 2857ec681f3Smrg //! v1: %res1 = v_cndmask_b32 0, 42, %c 2867ec681f3Smrg //! p_unit_test 1, %res1 2877ec681f3Smrg subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2])); 2887ec681f3Smrg writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev)); 2897ec681f3Smrg 2907ec681f3Smrg //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c 2917ec681f3Smrg //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev 2927ec681f3Smrg //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c 2937ec681f3Smrg //! p_unit_test 2, %res2 2947ec681f3Smrg subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2])); 2957ec681f3Smrg writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev)); 2967ec681f3Smrg 2977ec681f3Smrg //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c 2987ec681f3Smrg //! v1: %xor = v_xor_b32 %a, %subbrev1 2997ec681f3Smrg //! v1: %res3 = v_cndmask_b32 0, %xor, %c 3007ec681f3Smrg //! p_unit_test 3, %res3 3017ec681f3Smrg subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2])); 3027ec681f3Smrg Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev); 3037ec681f3Smrg writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev)); 3047ec681f3Smrg 3057ec681f3Smrg //! v1: %res4 = v_cndmask_b32 0, %a, %c 3067ec681f3Smrg //! p_unit_test 4, %res4 3077ec681f3Smrg Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 3087ec681f3Smrg Operand::c32(1u), Operand(inputs[2])); 3097ec681f3Smrg Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask); 3107ec681f3Smrg writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub)); 3117ec681f3Smrg 3127ec681f3Smrg finish_opt_test(); 3137ec681f3Smrg } 3147ec681f3SmrgEND_TEST 3157ec681f3Smrg 3167ec681f3SmrgBEGIN_TEST(optimize.add_lshl) 3177ec681f3Smrg for (unsigned i = GFX8; i <= GFX10; i++) { 3187ec681f3Smrg //>> s1: %a, v1: %b = p_startpgm 3197ec681f3Smrg if (!setup_cs("s1 v1", (chip_class)i)) 3207ec681f3Smrg continue; 3217ec681f3Smrg 3227ec681f3Smrg Temp shift; 3237ec681f3Smrg 3247ec681f3Smrg //~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3 3257ec681f3Smrg //~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4 3267ec681f3Smrg //~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4 3277ec681f3Smrg //! p_unit_test 0, %res0 3287ec681f3Smrg shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]), 3297ec681f3Smrg Operand::c32(3u)); 3307ec681f3Smrg writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, 3317ec681f3Smrg Operand::c32(4u))); 3327ec681f3Smrg 3337ec681f3Smrg //~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3 3347ec681f3Smrg //~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4 3357ec681f3Smrg //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b 3367ec681f3Smrg //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1 3377ec681f3Smrg //~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4 3387ec681f3Smrg //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b 3397ec681f3Smrg //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add 3407ec681f3Smrg //! p_unit_test 1, %res1 3417ec681f3Smrg shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]), 3427ec681f3Smrg Operand::c32(3u)); 3437ec681f3Smrg Temp sadd = 3447ec681f3Smrg bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u)); 3457ec681f3Smrg Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1])); 3467ec681f3Smrg writeout(1, bld.vadd32(bld.def(v1), sadd, vadd)); 3477ec681f3Smrg 3487ec681f3Smrg //~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3 3497ec681f3Smrg //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b 3507ec681f3Smrg //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b 3517ec681f3Smrg //! p_unit_test 2, %res2 3527ec681f3Smrg Temp lshl = 3537ec681f3Smrg bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u)); 3547ec681f3Smrg writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 3557ec681f3Smrg 3567ec681f3Smrg //~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7 3577ec681f3Smrg //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b 3587ec681f3Smrg //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b 3597ec681f3Smrg //! p_unit_test 3, %res3 3607ec681f3Smrg Operand a_24bit = Operand(inputs[0]); 3617ec681f3Smrg a_24bit.set24bit(true); 3627ec681f3Smrg lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u)); 3637ec681f3Smrg writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 3647ec681f3Smrg 3657ec681f3Smrg //! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3 3667ec681f3Smrg //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b 3677ec681f3Smrg //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b 3687ec681f3Smrg //! p_unit_test 4, %carry 3697ec681f3Smrg lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u)); 3707ec681f3Smrg Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp(); 3717ec681f3Smrg writeout(4, carry); 3727ec681f3Smrg 3737ec681f3Smrg //~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a 3747ec681f3Smrg //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b 3757ec681f3Smrg //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b 3767ec681f3Smrg //! p_unit_test 5, %res5 3777ec681f3Smrg lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit); 3787ec681f3Smrg writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 3797ec681f3Smrg 3807ec681f3Smrg //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b 3817ec681f3Smrg //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b 3827ec681f3Smrg //! p_unit_test 6, %res6 3837ec681f3Smrg lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u)); 3847ec681f3Smrg writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 3857ec681f3Smrg 3867ec681f3Smrg //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b 3877ec681f3Smrg //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b 3887ec681f3Smrg //! p_unit_test 7, %res7 3897ec681f3Smrg Operand a_16bit = Operand(inputs[0]); 3907ec681f3Smrg a_16bit.set16bit(true); 3917ec681f3Smrg lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u)); 3927ec681f3Smrg writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 3937ec681f3Smrg 3947ec681f3Smrg finish_opt_test(); 3957ec681f3Smrg } 3967ec681f3SmrgEND_TEST 3977ec681f3Smrg 3987ec681f3SmrgBEGIN_TEST(optimize.bcnt) 3997ec681f3Smrg for (unsigned i = GFX8; i <= GFX10; i++) { 4007ec681f3Smrg //>> v1: %a, s1: %b = p_startpgm 4017ec681f3Smrg if (!setup_cs("v1 s1", (chip_class)i)) 4027ec681f3Smrg continue; 4037ec681f3Smrg 4047ec681f3Smrg Temp bcnt; 4057ec681f3Smrg 4067ec681f3Smrg //! v1: %res0 = v_bcnt_u32_b32 %a, %a 4077ec681f3Smrg //! p_unit_test 0, %res0 4087ec681f3Smrg bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero()); 4097ec681f3Smrg writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]))); 4107ec681f3Smrg 4117ec681f3Smrg //! v1: %res1 = v_bcnt_u32_b32 %a, %b 4127ec681f3Smrg //! p_unit_test 1, %res1 4137ec681f3Smrg bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero()); 4147ec681f3Smrg writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1]))); 4157ec681f3Smrg 4167ec681f3Smrg //! v1: %res2 = v_bcnt_u32_b32 %a, 42 4177ec681f3Smrg //! p_unit_test 2, %res2 4187ec681f3Smrg bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero()); 4197ec681f3Smrg writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u))); 4207ec681f3Smrg 4217ec681f3Smrg //! v1: %bnct3 = v_bcnt_u32_b32 %b, 0 4227ec681f3Smrg //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a 4237ec681f3Smrg //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a 4247ec681f3Smrg //! p_unit_test 3, %res3 4257ec681f3Smrg bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero()); 4267ec681f3Smrg writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]))); 4277ec681f3Smrg 4287ec681f3Smrg //! v1: %bnct4 = v_bcnt_u32_b32 %a, 0 4297ec681f3Smrg //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a 4307ec681f3Smrg //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a 4317ec681f3Smrg //! p_unit_test 4, %carry 4327ec681f3Smrg bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero()); 4337ec681f3Smrg Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp(); 4347ec681f3Smrg writeout(4, carry); 4357ec681f3Smrg 4367ec681f3Smrg finish_opt_test(); 4377ec681f3Smrg } 4387ec681f3SmrgEND_TEST 4397ec681f3Smrg 4407ec681f3Smrgstruct clamp_config { 4417ec681f3Smrg const char *name; 4427ec681f3Smrg aco_opcode min, max, med3; 4437ec681f3Smrg Operand lb, ub; 4447ec681f3Smrg}; 4457ec681f3Smrg 4467ec681f3Smrgstatic const clamp_config clamp_configs[] = { 4477ec681f3Smrg /* 0.0, 4.0 */ 4487ec681f3Smrg {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32, 4497ec681f3Smrg Operand::zero(), Operand::c32(0x40800000u)}, 4507ec681f3Smrg {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16, 4517ec681f3Smrg Operand::c16(0u), Operand::c16(0x4400)}, 4527ec681f3Smrg /* -1.0, 0.0 */ 4537ec681f3Smrg {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32, 4547ec681f3Smrg Operand::c32(0xbf800000u), Operand::zero()}, 4557ec681f3Smrg {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16, 4567ec681f3Smrg Operand::c16(0xBC00), Operand::c16(0u)}, 4577ec681f3Smrg /* 0, 3 */ 4587ec681f3Smrg {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32, 4597ec681f3Smrg Operand::zero(), Operand::c32(3u)}, 4607ec681f3Smrg {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16, 4617ec681f3Smrg Operand::c16(0u), Operand::c16(3u)}, 4627ec681f3Smrg {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32, 4637ec681f3Smrg Operand::zero(), Operand::c32(3u)}, 4647ec681f3Smrg {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16, 4657ec681f3Smrg Operand::c16(0u), Operand::c16(3u)}, 4667ec681f3Smrg /* -5, 0 */ 4677ec681f3Smrg {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32, 4687ec681f3Smrg Operand::c32(0xfffffffbu), Operand::zero()}, 4697ec681f3Smrg {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16, 4707ec681f3Smrg Operand::c16(0xfffbu), Operand::c16(0u)}, 4717ec681f3Smrg}; 4727ec681f3Smrg 4737ec681f3SmrgBEGIN_TEST(optimize.clamp) 4747ec681f3Smrg for (clamp_config cfg : clamp_configs) { 4757ec681f3Smrg if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name)) 4767ec681f3Smrg continue; 4777ec681f3Smrg 4787ec681f3Smrg //! cfg: @match_func(min max med3 lb ub) 4797ec681f3Smrg fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]); 4807ec681f3Smrg fprintf(output, "%s ", instr_info.name[(int)cfg.max]); 4817ec681f3Smrg fprintf(output, "%s ", instr_info.name[(int)cfg.med3]); 4827ec681f3Smrg aco_print_operand(&cfg.lb, output); 4837ec681f3Smrg fprintf(output, " "); 4847ec681f3Smrg aco_print_operand(&cfg.ub, output); 4857ec681f3Smrg fprintf(output, "\n"); 4867ec681f3Smrg 4877ec681f3Smrg //>> v1: %a, v1: %b, v1: %c = p_startpgm 4887ec681f3Smrg 4897ec681f3Smrg //! v1: %res0 = @med3 @ub, @lb, %a 4907ec681f3Smrg //! p_unit_test 0, %res0 4917ec681f3Smrg writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub, 4927ec681f3Smrg bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]))); 4937ec681f3Smrg 4947ec681f3Smrg //! v1: %res1 = @med3 @lb, @ub, %a 4957ec681f3Smrg //! p_unit_test 1, %res1 4967ec681f3Smrg writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb, 4977ec681f3Smrg bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]))); 4987ec681f3Smrg 4997ec681f3Smrg /* min constant must be greater than max constant */ 5007ec681f3Smrg //! v1: %res2_tmp = @min @lb, %a 5017ec681f3Smrg //! v1: %res2 = @max @ub, %res2_tmp 5027ec681f3Smrg //! p_unit_test 2, %res2 5037ec681f3Smrg writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub, 5047ec681f3Smrg bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0]))); 5057ec681f3Smrg 5067ec681f3Smrg //! v1: %res3_tmp = @max @ub, %a 5077ec681f3Smrg //! v1: %res3 = @min @lb, %res3_tmp 5087ec681f3Smrg //! p_unit_test 3, %res3 5097ec681f3Smrg writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb, 5107ec681f3Smrg bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0]))); 5117ec681f3Smrg 5127ec681f3Smrg /* needs two constants */ 5137ec681f3Smrg 5147ec681f3Smrg //! v1: %res4_tmp = @max @lb, %a 5157ec681f3Smrg //! v1: %res4 = @min %b, %res4_tmp 5167ec681f3Smrg //! p_unit_test 4, %res4 5177ec681f3Smrg writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1], 5187ec681f3Smrg bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]))); 5197ec681f3Smrg 5207ec681f3Smrg //! v1: %res5_tmp = @max %b, %a 5217ec681f3Smrg //! v1: %res5 = @min @ub, %res5_tmp 5227ec681f3Smrg //! p_unit_test 5, %res5 5237ec681f3Smrg writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub, 5247ec681f3Smrg bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0]))); 5257ec681f3Smrg 5267ec681f3Smrg //! v1: %res6_tmp = @max %c, %a 5277ec681f3Smrg //! v1: %res6 = @min %b, %res6_tmp 5287ec681f3Smrg //! p_unit_test 6, %res6 5297ec681f3Smrg writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1], 5307ec681f3Smrg bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0]))); 5317ec681f3Smrg 5327ec681f3Smrg /* correct NaN behaviour with precise */ 5337ec681f3Smrg 5347ec681f3Smrg //! v1: %res7 = @med3 @ub, @lb, %a 5357ec681f3Smrg //! p_unit_test 7, %res7 5367ec681f3Smrg Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]); 5377ec681f3Smrg max.def(0).setPrecise(true); 5387ec681f3Smrg Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max); 5397ec681f3Smrg max.def(0).setPrecise(true); 5407ec681f3Smrg writeout(7, min); 5417ec681f3Smrg 5427ec681f3Smrg //! v1: (precise)%res8_tmp = @min @ub, %a 5437ec681f3Smrg //! v1: %res8 = @max @lb, %res8_tmp 5447ec681f3Smrg //! p_unit_test 8, %res8 5457ec681f3Smrg min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]); 5467ec681f3Smrg min.def(0).setPrecise(true); 5477ec681f3Smrg writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min)); 5487ec681f3Smrg 5497ec681f3Smrg finish_opt_test(); 5507ec681f3Smrg } 5517ec681f3SmrgEND_TEST 5527ec681f3Smrg 5537ec681f3SmrgBEGIN_TEST(optimize.const_comparison_ordering) 5547ec681f3Smrg //>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm 5557ec681f3Smrg if (!setup_cs("v1 v1 v2 v1", GFX9)) 5567ec681f3Smrg return; 5577ec681f3Smrg 5587ec681f3Smrg /* optimize to unordered comparison */ 5597ec681f3Smrg //! s2: %res0 = v_cmp_nge_f32 4.0, %a 5607ec681f3Smrg //! p_unit_test 0, %res0 5617ec681f3Smrg writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), 5627ec681f3Smrg bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]), 5637ec681f3Smrg bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), 5647ec681f3Smrg Operand::c32(0x40800000u), inputs[0]))); 5657ec681f3Smrg 5667ec681f3Smrg //! s2: %res1 = v_cmp_nge_f32 4.0, %a 5677ec681f3Smrg //! p_unit_test 1, %res1 5687ec681f3Smrg writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), 5697ec681f3Smrg bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]), 5707ec681f3Smrg bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), 5717ec681f3Smrg Operand::c32(0x40800000u), inputs[0]))); 5727ec681f3Smrg 5737ec681f3Smrg //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a 5747ec681f3Smrg //! p_unit_test 2, %res2 5757ec681f3Smrg writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), 5767ec681f3Smrg bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]), 5777ec681f3Smrg bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), 5787ec681f3Smrg bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0]))); 5797ec681f3Smrg 5807ec681f3Smrg /* optimize to ordered comparison */ 5817ec681f3Smrg //! s2: %res3 = v_cmp_lt_f32 4.0, %a 5827ec681f3Smrg //! p_unit_test 3, %res3 5837ec681f3Smrg writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), 5847ec681f3Smrg bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]), 5857ec681f3Smrg bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), 5867ec681f3Smrg Operand::c32(0x40800000u), inputs[0]))); 5877ec681f3Smrg 5887ec681f3Smrg //! s2: %res4 = v_cmp_lt_f32 4.0, %a 5897ec681f3Smrg //! p_unit_test 4, %res4 5907ec681f3Smrg writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), 5917ec681f3Smrg bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]), 5927ec681f3Smrg bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), 5937ec681f3Smrg Operand::c32(0x40800000u), inputs[0]))); 5947ec681f3Smrg 5957ec681f3Smrg //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a 5967ec681f3Smrg //! p_unit_test 5, %res5 5977ec681f3Smrg writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), 5987ec681f3Smrg bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]), 5997ec681f3Smrg bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), 6007ec681f3Smrg bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0]))); 6017ec681f3Smrg 6027ec681f3Smrg /* similar but unoptimizable expressions */ 6037ec681f3Smrg //! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a 6047ec681f3Smrg //! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a 6057ec681f3Smrg //! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0 6067ec681f3Smrg //! p_unit_test 6, %res6 6077ec681f3Smrg Temp src1 = 6087ec681f3Smrg bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]); 6097ec681f3Smrg Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]); 6107ec681f3Smrg writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1)); 6117ec681f3Smrg 6127ec681f3Smrg //! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a 6137ec681f3Smrg //! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a 6147ec681f3Smrg //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0 6157ec681f3Smrg //! p_unit_test 7, %res7 6167ec681f3Smrg src1 = 6177ec681f3Smrg bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]); 6187ec681f3Smrg src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]); 6197ec681f3Smrg writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1)); 6207ec681f3Smrg 6217ec681f3Smrg //! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d 6227ec681f3Smrg //! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a 6237ec681f3Smrg //! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0 6247ec681f3Smrg //! p_unit_test 8, %res8 6257ec681f3Smrg src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]); 6267ec681f3Smrg src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]); 6277ec681f3Smrg writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1)); 6287ec681f3Smrg 6297ec681f3Smrg //! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a 6307ec681f3Smrg //! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d 6317ec681f3Smrg //! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0 6327ec681f3Smrg //! p_unit_test 9, %res9 6337ec681f3Smrg src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]); 6347ec681f3Smrg src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]); 6357ec681f3Smrg writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1)); 6367ec681f3Smrg 6377ec681f3Smrg /* bit sizes */ 6387ec681f3Smrg //! s2: %res10 = v_cmp_nge_f16 4.0, %b 6397ec681f3Smrg //! p_unit_test 10, %res10 6407ec681f3Smrg Temp input1_16 = 6417ec681f3Smrg bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero()); 6427ec681f3Smrg writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), 6437ec681f3Smrg bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16), 6447ec681f3Smrg bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u), 6457ec681f3Smrg input1_16))); 6467ec681f3Smrg 6477ec681f3Smrg //! s2: %res11 = v_cmp_nge_f64 4.0, %c 6487ec681f3Smrg //! p_unit_test 11, %res11 6497ec681f3Smrg writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), 6507ec681f3Smrg bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]), 6517ec681f3Smrg bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), 6527ec681f3Smrg Operand::c64(0x4010000000000000u), inputs[2]))); 6537ec681f3Smrg 6547ec681f3Smrg /* NaN */ 6557ec681f3Smrg uint16_t nan16 = 0x7e00; 6567ec681f3Smrg uint32_t nan32 = 0x7fc00000; 6577ec681f3Smrg uint64_t nan64 = 0xffffffffffffffffllu; 6587ec681f3Smrg 6597ec681f3Smrg //! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a 6607ec681f3Smrg //! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a 6617ec681f3Smrg //! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0 6627ec681f3Smrg //! p_unit_test 12, %res12 6637ec681f3Smrg src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]); 6647ec681f3Smrg src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]); 6657ec681f3Smrg writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1)); 6667ec681f3Smrg 6677ec681f3Smrg //! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a 6687ec681f3Smrg //! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a 6697ec681f3Smrg //! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0 6707ec681f3Smrg //! p_unit_test 13, %res13 6717ec681f3Smrg src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]); 6727ec681f3Smrg src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]); 6737ec681f3Smrg writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1)); 6747ec681f3Smrg 6757ec681f3Smrg //! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a 6767ec681f3Smrg //! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a 6777ec681f3Smrg //! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0 6787ec681f3Smrg //! p_unit_test 14, %res14 6797ec681f3Smrg src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]); 6807ec681f3Smrg src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]); 6817ec681f3Smrg writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1)); 6827ec681f3Smrg 6837ec681f3Smrg finish_opt_test(); 6847ec681f3SmrgEND_TEST 6857ec681f3Smrg 6867ec681f3SmrgBEGIN_TEST(optimize.add3) 6877ec681f3Smrg //>> v1: %a, v1: %b, v1: %c = p_startpgm 6887ec681f3Smrg if (!setup_cs("v1 v1 v1", GFX9)) 6897ec681f3Smrg return; 6907ec681f3Smrg 6917ec681f3Smrg //! v1: %res0 = v_add3_u32 %a, %b, %c 6927ec681f3Smrg //! p_unit_test 0, %res0 6937ec681f3Smrg Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]); 6947ec681f3Smrg writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp)); 6957ec681f3Smrg 6967ec681f3Smrg //! v1: %tmp1 = v_add_u32 %b, %c clamp 6977ec681f3Smrg //! v1: %res1 = v_add_u32 %a, %tmp1 6987ec681f3Smrg //! p_unit_test 1, %res1 6997ec681f3Smrg tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]); 7007ec681f3Smrg tmp.instr->vop3().clamp = true; 7017ec681f3Smrg writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp)); 7027ec681f3Smrg 7037ec681f3Smrg //! v1: %tmp2 = v_add_u32 %b, %c 7047ec681f3Smrg //! v1: %res2 = v_add_u32 %a, %tmp2 clamp 7057ec681f3Smrg //! p_unit_test 2, %res2 7067ec681f3Smrg tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]); 7077ec681f3Smrg tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp); 7087ec681f3Smrg tmp.instr->vop3().clamp = true; 7097ec681f3Smrg writeout(2, tmp); 7107ec681f3Smrg 7117ec681f3Smrg finish_opt_test(); 7127ec681f3SmrgEND_TEST 7137ec681f3Smrg 7147ec681f3SmrgBEGIN_TEST(optimize.minmax) 7157ec681f3Smrg for (unsigned i = GFX9; i <= GFX10; i++) { 7167ec681f3Smrg //>> v1: %a = p_startpgm 7177ec681f3Smrg if (!setup_cs("v1", (chip_class)i)) 7187ec681f3Smrg continue; 7197ec681f3Smrg 7207ec681f3Smrg //! v1: %res0 = v_max3_f32 0, -0, %a 7217ec681f3Smrg //! p_unit_test 0, %res0 7227ec681f3Smrg Temp xor0 = fneg(inputs[0]); 7237ec681f3Smrg Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0); 7247ec681f3Smrg Temp xor1 = fneg(min); 7257ec681f3Smrg writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1)); 7267ec681f3Smrg 7277ec681f3Smrg //! v1: %res1 = v_max3_f32 0, -0, -%a 7287ec681f3Smrg //! p_unit_test 1, %res1 7297ec681f3Smrg min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0])); 7307ec681f3Smrg xor1 = fneg(min); 7317ec681f3Smrg writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1)); 7327ec681f3Smrg 7337ec681f3Smrg finish_opt_test(); 7347ec681f3Smrg } 7357ec681f3SmrgEND_TEST 7367ec681f3Smrg 7377ec681f3SmrgBEGIN_TEST(optimize.mad_32_24) 7387ec681f3Smrg for (unsigned i = GFX8; i <= GFX9; i++) { 7397ec681f3Smrg //>> v1: %a, v1: %b, v1: %c = p_startpgm 7407ec681f3Smrg if (!setup_cs("v1 v1 v1", (chip_class)i)) 7417ec681f3Smrg continue; 7427ec681f3Smrg 7437ec681f3Smrg //! v1: %res0 = v_mad_u32_u24 %b, %c, %a 7447ec681f3Smrg //! p_unit_test 0, %res0 7457ec681f3Smrg Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]); 7467ec681f3Smrg writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul)); 7477ec681f3Smrg 7487ec681f3Smrg //! v1: %res1_tmp = v_mul_u32_u24 %b, %c 7497ec681f3Smrg //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp 7507ec681f3Smrg //! p_unit_test 1, %res1 7517ec681f3Smrg mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]); 7527ec681f3Smrg writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp()); 7537ec681f3Smrg 7547ec681f3Smrg finish_opt_test(); 7557ec681f3Smrg } 7567ec681f3SmrgEND_TEST 7577ec681f3Smrg 7587ec681f3SmrgBEGIN_TEST(optimize.add_lshlrev) 7597ec681f3Smrg for (unsigned i = GFX8; i <= GFX10; i++) { 7607ec681f3Smrg //>> v1: %a, v1: %b, s1: %c = p_startpgm 7617ec681f3Smrg if (!setup_cs("v1 v1 s1", (chip_class)i)) 7627ec681f3Smrg continue; 7637ec681f3Smrg 7647ec681f3Smrg Temp lshl; 7657ec681f3Smrg 7667ec681f3Smrg //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a 7677ec681f3Smrg //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b 7687ec681f3Smrg //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b 7697ec681f3Smrg //! p_unit_test 0, %res0 7707ec681f3Smrg lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0])); 7717ec681f3Smrg writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 7727ec681f3Smrg 7737ec681f3Smrg //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a 7747ec681f3Smrg //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b 7757ec681f3Smrg //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b 7767ec681f3Smrg //! p_unit_test 1, %res1 7777ec681f3Smrg Operand a_24bit = Operand(inputs[0]); 7787ec681f3Smrg a_24bit.set24bit(true); 7797ec681f3Smrg lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit); 7807ec681f3Smrg writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 7817ec681f3Smrg 7827ec681f3Smrg //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b 7837ec681f3Smrg //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b 7847ec681f3Smrg //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b 7857ec681f3Smrg //! p_unit_test 2, %res2 7867ec681f3Smrg Operand b_24bit = Operand(inputs[1]); 7877ec681f3Smrg b_24bit.set24bit(true); 7887ec681f3Smrg lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit); 7897ec681f3Smrg writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 7907ec681f3Smrg 7917ec681f3Smrg //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b 7927ec681f3Smrg //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b 7937ec681f3Smrg //! p_unit_test 3, %res3 7947ec681f3Smrg lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit); 7957ec681f3Smrg writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 7967ec681f3Smrg 7977ec681f3Smrg //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b 7987ec681f3Smrg //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b 7997ec681f3Smrg //! p_unit_test 4, %res4 8007ec681f3Smrg Operand a_16bit = Operand(inputs[0]); 8017ec681f3Smrg a_16bit.set16bit(true); 8027ec681f3Smrg lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit); 8037ec681f3Smrg writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]))); 8047ec681f3Smrg 8057ec681f3Smrg //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c 8067ec681f3Smrg //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c 8077ec681f3Smrg //! p_unit_test 5, %res5 8087ec681f3Smrg Operand c_24bit = Operand(inputs[2]); 8097ec681f3Smrg c_24bit.set24bit(true); 8107ec681f3Smrg lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit); 8117ec681f3Smrg writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2]))); 8127ec681f3Smrg 8137ec681f3Smrg finish_opt_test(); 8147ec681f3Smrg } 8157ec681f3SmrgEND_TEST 8167ec681f3Smrg 8177ec681f3Smrgenum denorm_op { 8187ec681f3Smrg denorm_mul1 = 0, 8197ec681f3Smrg denorm_fneg = 1, 8207ec681f3Smrg denorm_fabs = 2, 8217ec681f3Smrg denorm_fnegabs = 3, 8227ec681f3Smrg}; 8237ec681f3Smrg 8247ec681f3Smrgstatic const char *denorm_op_names[] = { 8257ec681f3Smrg "mul1", 8267ec681f3Smrg "fneg", 8277ec681f3Smrg "fabs", 8287ec681f3Smrg "fnegabs", 8297ec681f3Smrg}; 8307ec681f3Smrg 8317ec681f3Smrgstruct denorm_config { 8327ec681f3Smrg bool flush; 8337ec681f3Smrg unsigned op; 8347ec681f3Smrg aco_opcode src; 8357ec681f3Smrg aco_opcode dest; 8367ec681f3Smrg}; 8377ec681f3Smrg 8387ec681f3Smrgstatic const char *srcdest_op_name(aco_opcode op) 8397ec681f3Smrg{ 8407ec681f3Smrg switch (op) { 8417ec681f3Smrg case aco_opcode::v_cndmask_b32: 8427ec681f3Smrg return "cndmask"; 8437ec681f3Smrg case aco_opcode::v_min_f32: 8447ec681f3Smrg return "min"; 8457ec681f3Smrg case aco_opcode::v_rcp_f32: 8467ec681f3Smrg return "rcp"; 8477ec681f3Smrg default: 8487ec681f3Smrg return "none"; 8497ec681f3Smrg } 8507ec681f3Smrg} 8517ec681f3Smrg 8527ec681f3Smrgstatic Temp emit_denorm_srcdest(aco_opcode op, Temp val) 8537ec681f3Smrg{ 8547ec681f3Smrg switch (op) { 8557ec681f3Smrg case aco_opcode::v_cndmask_b32: 8567ec681f3Smrg return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]); 8577ec681f3Smrg case aco_opcode::v_min_f32: 8587ec681f3Smrg return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val); 8597ec681f3Smrg case aco_opcode::v_rcp_f32: 8607ec681f3Smrg return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val); 8617ec681f3Smrg default: 8627ec681f3Smrg return val; 8637ec681f3Smrg } 8647ec681f3Smrg} 8657ec681f3Smrg 8667ec681f3SmrgBEGIN_TEST(optimize.denorm_propagation) 8677ec681f3Smrg for (unsigned i = GFX8; i <= GFX9; i++) { 8687ec681f3Smrg std::vector<denorm_config> configs; 8697ec681f3Smrg for (bool flush : {false, true}) { 8707ec681f3Smrg for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs}) 8717ec681f3Smrg configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes}); 8727ec681f3Smrg 8737ec681f3Smrg for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) { 8747ec681f3Smrg for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs}) 8757ec681f3Smrg configs.push_back({flush, op, aco_opcode::num_opcodes, dest}); 8767ec681f3Smrg } 8777ec681f3Smrg 8787ec681f3Smrg for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) { 8797ec681f3Smrg for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs}) 8807ec681f3Smrg configs.push_back({flush, op, src, aco_opcode::num_opcodes}); 8817ec681f3Smrg } 8827ec681f3Smrg } 8837ec681f3Smrg 8847ec681f3Smrg for (denorm_config cfg : configs) { 8857ec681f3Smrg char subvariant[128]; 8867ec681f3Smrg sprintf(subvariant, "_%s_%s_%s_%s", 8877ec681f3Smrg cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src), 8887ec681f3Smrg denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest)); 8897ec681f3Smrg if (!setup_cs("v1 s2", (chip_class)i, CHIP_UNKNOWN, subvariant)) 8907ec681f3Smrg continue; 8917ec681f3Smrg 8927ec681f3Smrg bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) || 8937ec681f3Smrg cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) || 8947ec681f3Smrg !cfg.flush; 8957ec681f3Smrg 8967ec681f3Smrg fprintf(output, "src, dest, op: %s %s %s\n", 8977ec681f3Smrg srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]); 8987ec681f3Smrg fprintf(output, "can_propagate: %u\n", can_propagate); 8997ec681f3Smrg //! src, dest, op: $src $dest $op 9007ec681f3Smrg //! can_propagate: #can_propagate 9017ec681f3Smrg //>> v1: %a, s2: %b = p_startpgm 9027ec681f3Smrg 9037ec681f3Smrg //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b', 9047ec681f3Smrg //; 'min': 'v1: %{} = v_min_f32 0, {}', 9057ec681f3Smrg //; 'rcp': 'v1: %{} = v_rcp_f32 {}'} 9067ec681f3Smrg //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}', 9077ec681f3Smrg //; 'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}', 9087ec681f3Smrg //; 'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|', 9097ec681f3Smrg //; 'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'} 9107ec681f3Smrg //; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'} 9117ec681f3Smrg 9127ec681f3Smrg //; name = 'a' 9137ec681f3Smrg //; if src != 'none': 9147ec681f3Smrg //; insert_pattern(patterns[src].format('src_res', '%'+name)) 9157ec681f3Smrg //; name = 'src_res' 9167ec681f3Smrg 9177ec681f3Smrg //; if can_propagate: 9187ec681f3Smrg //; name = inline_ops[op].format(name) 9197ec681f3Smrg //; else: 9207ec681f3Smrg //; insert_pattern(ops[op].format('op_res', name)) 9217ec681f3Smrg //; name = '%op_res' 9227ec681f3Smrg 9237ec681f3Smrg //; if dest != 'none': 9247ec681f3Smrg //; insert_pattern(patterns[dest].format('dest_res', name)) 9257ec681f3Smrg //; name = '%dest_res' 9267ec681f3Smrg 9277ec681f3Smrg //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name)) 9287ec681f3Smrg //! p_unit_test 0, %res 9297ec681f3Smrg 9307ec681f3Smrg program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep; 9317ec681f3Smrg 9327ec681f3Smrg Temp val = emit_denorm_srcdest(cfg.src, inputs[0]); 9337ec681f3Smrg switch (cfg.op) { 9347ec681f3Smrg case denorm_mul1: 9357ec681f3Smrg val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val); 9367ec681f3Smrg break; 9377ec681f3Smrg case denorm_fneg: 9387ec681f3Smrg val = fneg(val); 9397ec681f3Smrg break; 9407ec681f3Smrg case denorm_fabs: 9417ec681f3Smrg val = fabs(val); 9427ec681f3Smrg break; 9437ec681f3Smrg case denorm_fnegabs: 9447ec681f3Smrg val = fneg(fabs(val)); 9457ec681f3Smrg break; 9467ec681f3Smrg } 9477ec681f3Smrg val = emit_denorm_srcdest(cfg.dest, val); 9487ec681f3Smrg writeout( 9497ec681f3Smrg 0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1])); 9507ec681f3Smrg 9517ec681f3Smrg finish_opt_test(); 9527ec681f3Smrg } 9537ec681f3Smrg } 9547ec681f3SmrgEND_TEST 9557ec681f3Smrg 9567ec681f3SmrgBEGIN_TEST(optimizer.dpp) 9577ec681f3Smrg //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm 9587ec681f3Smrg if (!setup_cs("v1 v1 s2 s1", GFX10_3)) 9597ec681f3Smrg return; 9607ec681f3Smrg 9617ec681f3Smrg Operand a(inputs[0]); 9627ec681f3Smrg Operand b(inputs[1]); 9637ec681f3Smrg Operand c(inputs[2]); 9647ec681f3Smrg Operand d(inputs[3]); 9657ec681f3Smrg 9667ec681f3Smrg /* basic optimization */ 9677ec681f3Smrg //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1 9687ec681f3Smrg //! p_unit_test 0, %res0 9697ec681f3Smrg Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 9707ec681f3Smrg Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b); 9717ec681f3Smrg writeout(0, res0); 9727ec681f3Smrg 9737ec681f3Smrg /* operand swapping */ 9747ec681f3Smrg //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1 9757ec681f3Smrg //! p_unit_test 1, %res1 9767ec681f3Smrg Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 9777ec681f3Smrg Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1); 9787ec681f3Smrg writeout(1, res1); 9797ec681f3Smrg 9807ec681f3Smrg //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1 9817ec681f3Smrg //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1 9827ec681f3Smrg //! p_unit_test 2, %res2 9837ec681f3Smrg Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 9847ec681f3Smrg Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror); 9857ec681f3Smrg writeout(2, res2); 9867ec681f3Smrg 9877ec681f3Smrg /* modifiers */ 9887ec681f3Smrg //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 9897ec681f3Smrg //! p_unit_test 3, %res3 9907ec681f3Smrg auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 9917ec681f3Smrg tmp3.instr->dpp().neg[0] = true; 9927ec681f3Smrg Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b); 9937ec681f3Smrg writeout(3, res3); 9947ec681f3Smrg 9957ec681f3Smrg //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 9967ec681f3Smrg //! p_unit_test 4, %res4 9977ec681f3Smrg Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 9987ec681f3Smrg auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b); 9997ec681f3Smrg res4.instr->vop3().neg[0] = true; 10007ec681f3Smrg writeout(4, res4); 10017ec681f3Smrg 10027ec681f3Smrg //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1 10037ec681f3Smrg //! v1: %res5 = v_add_f32 %tmp5, %b clamp 10047ec681f3Smrg //! p_unit_test 5, %res5 10057ec681f3Smrg Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 10067ec681f3Smrg auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b); 10077ec681f3Smrg res5.instr->vop3().clamp = true; 10087ec681f3Smrg writeout(5, res5); 10097ec681f3Smrg 10107ec681f3Smrg //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 10117ec681f3Smrg //! p_unit_test 6, %res6 10127ec681f3Smrg auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 10137ec681f3Smrg tmp6.instr->dpp().neg[0] = true; 10147ec681f3Smrg auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b); 10157ec681f3Smrg res6.instr->vop3().abs[0] = true; 10167ec681f3Smrg writeout(6, res6); 10177ec681f3Smrg 10187ec681f3Smrg //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1 10197ec681f3Smrg //! p_unit_test 7, %res7 10207ec681f3Smrg Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 10217ec681f3Smrg auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7); 10227ec681f3Smrg res7.instr->vop3().abs[0] = true; 10237ec681f3Smrg writeout(7, res7); 10247ec681f3Smrg 10257ec681f3Smrg /* vcc */ 10267ec681f3Smrg //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 10277ec681f3Smrg //! p_unit_test 8, %res8 10287ec681f3Smrg Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 10297ec681f3Smrg Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c); 10307ec681f3Smrg writeout(8, res8); 10317ec681f3Smrg 10327ec681f3Smrg /* sgprs */ 10337ec681f3Smrg //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1 10347ec681f3Smrg //! v1: %res9 = v_add_f32 %tmp9, %d 10357ec681f3Smrg //! p_unit_test 9, %res9 10367ec681f3Smrg Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 10377ec681f3Smrg Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d); 10387ec681f3Smrg writeout(9, res9); 10397ec681f3Smrg 10407ec681f3Smrg //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1 10417ec681f3Smrg //! v1: %res10 = v_add_f32 %d, %tmp10 10427ec681f3Smrg //! p_unit_test 10, %res10 10437ec681f3Smrg Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror); 10447ec681f3Smrg Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10); 10457ec681f3Smrg writeout(10, res10); 10467ec681f3Smrg 10477ec681f3Smrg finish_opt_test(); 10487ec681f3SmrgEND_TEST 10497ec681f3Smrg 10507ec681f3SmrgBEGIN_TEST(optimize.dpp_prop) 10517ec681f3Smrg //>> v1: %a, s1: %b = p_startpgm 10527ec681f3Smrg if (!setup_cs("v1 s1", GFX10)) 10537ec681f3Smrg return; 10547ec681f3Smrg 10557ec681f3Smrg //! v1: %one = p_parallelcopy 1 10567ec681f3Smrg //! v1: %res0 = v_mul_f32 1, %a 10577ec681f3Smrg //! p_unit_test 0, %res0 10587ec681f3Smrg Temp one = bld.copy(bld.def(v1), Operand::c32(1)); 10597ec681f3Smrg writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1))); 10607ec681f3Smrg 10617ec681f3Smrg //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1 10627ec681f3Smrg //! p_unit_test 1, %res1 10637ec681f3Smrg writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1))); 10647ec681f3Smrg 10657ec681f3Smrg //! v1: %res2 = v_mul_f32 0x12345678, %a 10667ec681f3Smrg //! p_unit_test 2, %res2 10677ec681f3Smrg Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u)); 10687ec681f3Smrg writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1))); 10697ec681f3Smrg 10707ec681f3Smrg //! v1: %literal2 = p_parallelcopy 0x12345679 10717ec681f3Smrg //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1 10727ec681f3Smrg //! p_unit_test 3, %res3 10737ec681f3Smrg Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u)); 10747ec681f3Smrg writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1))); 10757ec681f3Smrg 10767ec681f3Smrg //! v1: %b_v = p_parallelcopy %b 10777ec681f3Smrg //! v1: %res4 = v_mul_f32 %b, %a 10787ec681f3Smrg //! p_unit_test 4, %res4 10797ec681f3Smrg Temp b_v = bld.copy(bld.def(v1), inputs[1]); 10807ec681f3Smrg writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1))); 10817ec681f3Smrg 10827ec681f3Smrg //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1 10837ec681f3Smrg //! p_unit_test 5, %res5 10847ec681f3Smrg writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1))); 10857ec681f3Smrg 10867ec681f3Smrg //! v1: %res6 = v_rcp_f32 %b 10877ec681f3Smrg //! p_unit_test 6, %res6 10887ec681f3Smrg writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1))); 10897ec681f3Smrg 10907ec681f3Smrg finish_opt_test(); 10917ec681f3SmrgEND_TEST 10927ec681f3Smrg 1093