/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
247ec681f3Smrg#include "helpers.h"
257ec681f3Smrg
267ec681f3Smrgusing namespace aco;
277ec681f3Smrg
287ec681f3SmrgBEGIN_TEST(optimize.neg)
297ec681f3Smrg   for (unsigned i = GFX9; i <= GFX10; i++) {
307ec681f3Smrg      //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
317ec681f3Smrg      if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
327ec681f3Smrg         continue;
337ec681f3Smrg
347ec681f3Smrg      //! v1: %res0 = v_mul_f32 %a, -%b
357ec681f3Smrg      //! p_unit_test 0, %res0
367ec681f3Smrg      Temp neg_b = fneg(inputs[1]);
377ec681f3Smrg      writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
387ec681f3Smrg
397ec681f3Smrg      //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
407ec681f3Smrg      //~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
417ec681f3Smrg      //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
427ec681f3Smrg      //! p_unit_test 1, %res1
437ec681f3Smrg      Temp neg_a = fneg(inputs[0]);
447ec681f3Smrg      writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));
457ec681f3Smrg
467ec681f3Smrg      //! v1: %res2 = v_mul_f32 %a, %b
477ec681f3Smrg      //! p_unit_test 2, %res2
487ec681f3Smrg      Temp neg_neg_a = fneg(neg_a);
497ec681f3Smrg      writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
507ec681f3Smrg
517ec681f3Smrg      //! v1: %res3 = v_mul_f32 |%a|, %b
527ec681f3Smrg      //! p_unit_test 3, %res3
537ec681f3Smrg      Temp abs_neg_a = fabs(neg_a);
547ec681f3Smrg      writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
557ec681f3Smrg
567ec681f3Smrg      //! v1: %res4 = v_mul_f32 -|%a|, %b
577ec681f3Smrg      //! p_unit_test 4, %res4
587ec681f3Smrg      Temp abs_a = fabs(inputs[0]);
597ec681f3Smrg      Temp neg_abs_a = fneg(abs_a);
607ec681f3Smrg      writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
617ec681f3Smrg
627ec681f3Smrg      //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
637ec681f3Smrg      //! p_unit_test 5, %res5
647ec681f3Smrg      writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
657ec681f3Smrg
667ec681f3Smrg      //! v1: %res6 = v_subrev_f32 %a, %b
677ec681f3Smrg      //! p_unit_test 6, %res6
687ec681f3Smrg      writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
697ec681f3Smrg
707ec681f3Smrg      //! v1: %res7 = v_sub_f32 %b, %a
717ec681f3Smrg      //! p_unit_test 7, %res7
727ec681f3Smrg      writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
737ec681f3Smrg
747ec681f3Smrg      //! v1: %res8 = v_mul_f32 %a, -%c
757ec681f3Smrg      //! p_unit_test 8, %res8
767ec681f3Smrg      Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
777ec681f3Smrg      writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
787ec681f3Smrg
797ec681f3Smrg      // //! v1: %res9 = v_mul_f32 |%neg_a|, %b
807ec681f3Smrg      // //! p_unit_test 9, %res9
817ec681f3Smrg      Temp abs_neg_abs_a = fabs(neg_abs_a);
827ec681f3Smrg      writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
837ec681f3Smrg
847ec681f3Smrg      finish_opt_test();
857ec681f3Smrg   }
867ec681f3SmrgEND_TEST
877ec681f3Smrg
887ec681f3SmrgBEGIN_TEST(optimize.output_modifiers)
897ec681f3Smrg   //>> v1: %a, v1: %b = p_startpgm
907ec681f3Smrg   if (!setup_cs("v1 v1", GFX9))
917ec681f3Smrg      return;
927ec681f3Smrg
937ec681f3Smrg   program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
947ec681f3Smrg
957ec681f3Smrg   /* 32-bit modifiers */
967ec681f3Smrg
977ec681f3Smrg   //! v1: %res0 = v_add_f32 %a, %b *0.5
987ec681f3Smrg   //! p_unit_test 0, %res0
997ec681f3Smrg   Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1007ec681f3Smrg   writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
1017ec681f3Smrg
1027ec681f3Smrg   //! v1: %res1 = v_add_f32 %a, %b *2
1037ec681f3Smrg   //! p_unit_test 1, %res1
1047ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1057ec681f3Smrg   writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
1067ec681f3Smrg
1077ec681f3Smrg   //! v1: %res2 = v_add_f32 %a, %b *4
1087ec681f3Smrg   //! p_unit_test 2, %res2
1097ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1107ec681f3Smrg   writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
1117ec681f3Smrg
1127ec681f3Smrg   //! v1: %res3 = v_add_f32 %a, %b clamp
1137ec681f3Smrg   //! p_unit_test 3, %res3
1147ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1157ec681f3Smrg   writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
1167ec681f3Smrg                        Operand::c32(0x3f800000u), tmp));
1177ec681f3Smrg
1187ec681f3Smrg   //! v1: %res4 = v_add_f32 %a, %b *2 clamp
1197ec681f3Smrg   //! p_unit_test 4, %res4
1207ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1217ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
1227ec681f3Smrg   writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
1237ec681f3Smrg                        Operand::c32(0x3f800000u), tmp));
1247ec681f3Smrg
1257ec681f3Smrg   /* 16-bit modifiers */
1267ec681f3Smrg
1277ec681f3Smrg   //! v2b: %res5 = v_add_f16 %a, %b *0.5
1287ec681f3Smrg   //! p_unit_test 5, %res5
1297ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
1307ec681f3Smrg   writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));
1317ec681f3Smrg
1327ec681f3Smrg   //! v2b: %res6 = v_add_f16 %a, %b *2
1337ec681f3Smrg   //! p_unit_test 6, %res6
1347ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
1357ec681f3Smrg   writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
1367ec681f3Smrg
1377ec681f3Smrg   //! v2b: %res7 = v_add_f16 %a, %b *4
1387ec681f3Smrg   //! p_unit_test 7, %res7
1397ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
1407ec681f3Smrg   writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));
1417ec681f3Smrg
1427ec681f3Smrg   //! v2b: %res8 = v_add_f16 %a, %b clamp
1437ec681f3Smrg   //! p_unit_test 8, %res8
1447ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
1457ec681f3Smrg   writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
1467ec681f3Smrg                        Operand::c16(0x3c00u), tmp));
1477ec681f3Smrg
1487ec681f3Smrg   //! v2b: %res9 = v_add_f16 %a, %b *2 clamp
1497ec681f3Smrg   //! p_unit_test 9, %res9
1507ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
1517ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
1527ec681f3Smrg   writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
1537ec681f3Smrg                        Operand::c16(0x3c00u), tmp));
1547ec681f3Smrg
1557ec681f3Smrg   /* clamping is done after omod */
1567ec681f3Smrg
1577ec681f3Smrg   //! v1: %res10_tmp = v_add_f32 %a, %b clamp
1587ec681f3Smrg   //! v1: %res10 = v_mul_f32 2.0, %res10_tmp
1597ec681f3Smrg   //! p_unit_test 10, %res10
1607ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1617ec681f3Smrg   tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
1627ec681f3Smrg                  tmp);
1637ec681f3Smrg   writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
1647ec681f3Smrg
1657ec681f3Smrg   /* unsupported instructions */
1667ec681f3Smrg
1677ec681f3Smrg   //! v1: %res11_tmp = v_xor_b32 %a, %b
1687ec681f3Smrg   //! v1: %res11 = v_mul_f32 2.0, %res11_tmp
1697ec681f3Smrg   //! p_unit_test 11, %res11
1707ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
1717ec681f3Smrg   writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
1727ec681f3Smrg
1737ec681f3Smrg   /* several users */
1747ec681f3Smrg
1757ec681f3Smrg   //! v1: %res12_tmp = v_add_f32 %a, %b
1767ec681f3Smrg   //! p_unit_test %res12_tmp
1777ec681f3Smrg   //! v1: %res12 = v_mul_f32 2.0, %res12_tmp
1787ec681f3Smrg   //! p_unit_test 12, %res12
1797ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1807ec681f3Smrg   bld.pseudo(aco_opcode::p_unit_test, tmp);
1817ec681f3Smrg   writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
1827ec681f3Smrg
1837ec681f3Smrg   //! v1: %res13 = v_add_f32 %a, %b
1847ec681f3Smrg   //! p_unit_test 13, %res13
1857ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
1867ec681f3Smrg   bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
1877ec681f3Smrg   writeout(13, tmp);
1887ec681f3Smrg
1897ec681f3Smrg   /* omod has no effect if denormals are enabled but clamp is fine */
1907ec681f3Smrg
1917ec681f3Smrg   //>> BB1
1927ec681f3Smrg   //! /* logical preds: / linear preds: / kind: uniform, */
1937ec681f3Smrg   program->next_fp_mode.denorm32 = fp_denorm_keep;
1947ec681f3Smrg   program->next_fp_mode.denorm16_64 = fp_denorm_flush;
1957ec681f3Smrg   bld.reset(program->create_and_insert_block());
1967ec681f3Smrg
1977ec681f3Smrg   //! v1: %res14_tmp = v_add_f32 %a, %b
1987ec681f3Smrg   //! v1: %res14 = v_mul_f32 2.0, %res13_tmp
1997ec681f3Smrg   //! p_unit_test 14, %res14
2007ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
2017ec681f3Smrg   writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
2027ec681f3Smrg
2037ec681f3Smrg   //! v1: %res15 = v_add_f32 %a, %b clamp
2047ec681f3Smrg   //! p_unit_test 15, %res15
2057ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
2067ec681f3Smrg   writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
2077ec681f3Smrg                         Operand::c32(0x3f800000u), tmp));
2087ec681f3Smrg
2097ec681f3Smrg   //>> BB2
2107ec681f3Smrg   //! /* logical preds: / linear preds: / kind: uniform, */
2117ec681f3Smrg   program->next_fp_mode.denorm32 = fp_denorm_flush;
2127ec681f3Smrg   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
2137ec681f3Smrg   bld.reset(program->create_and_insert_block());
2147ec681f3Smrg
2157ec681f3Smrg   //! v2b: %res16_tmp = v_add_f16 %a, %b
2167ec681f3Smrg   //! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
2177ec681f3Smrg   //! p_unit_test 16, %res16
2187ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
2197ec681f3Smrg   writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
2207ec681f3Smrg
2217ec681f3Smrg   //! v2b: %res17 = v_add_f16 %a, %b clamp
2227ec681f3Smrg   //! p_unit_test 17, %res17
2237ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
2247ec681f3Smrg   writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
2257ec681f3Smrg                         Operand::c16(0x3c00u), tmp));
2267ec681f3Smrg
2277ec681f3Smrg   /* omod flushes -0.0 to +0.0 */
2287ec681f3Smrg
2297ec681f3Smrg   //>> BB3
2307ec681f3Smrg   //! /* logical preds: / linear preds: / kind: uniform, */
2317ec681f3Smrg   program->next_fp_mode.denorm32 = fp_denorm_keep;
2327ec681f3Smrg   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
2337ec681f3Smrg   program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
2347ec681f3Smrg   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
2357ec681f3Smrg   bld.reset(program->create_and_insert_block());
2367ec681f3Smrg
2377ec681f3Smrg   //! v1: %res18_tmp = v_add_f32 %a, %b
2387ec681f3Smrg   //! v1: %res18 = v_mul_f32 2.0, %res18_tmp
2397ec681f3Smrg   //! p_unit_test 18, %res18
2407ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
2417ec681f3Smrg   writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
2427ec681f3Smrg   //! v1: %res19 = v_add_f32 %a, %b clamp
2437ec681f3Smrg   //! p_unit_test 19, %res19
2447ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
2457ec681f3Smrg   writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
2467ec681f3Smrg                         Operand::c32(0x3f800000u), tmp));
2477ec681f3Smrg
2487ec681f3Smrg   //>> BB4
2497ec681f3Smrg   //! /* logical preds: / linear preds: / kind: uniform, */
2507ec681f3Smrg   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
2517ec681f3Smrg   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
2527ec681f3Smrg   bld.reset(program->create_and_insert_block());
2537ec681f3Smrg   //! v2b: %res20_tmp = v_add_f16 %a, %b
2547ec681f3Smrg   //! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
2557ec681f3Smrg   //! p_unit_test 20, %res20
2567ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
2577ec681f3Smrg   writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
2587ec681f3Smrg   //! v2b: %res21 = v_add_f16 %a, %b clamp
2597ec681f3Smrg   //! p_unit_test 21, %res21
2607ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
2617ec681f3Smrg   writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
2627ec681f3Smrg                         Operand::c16(0x3c00u), tmp));
2637ec681f3Smrg
2647ec681f3Smrg   finish_opt_test();
2657ec681f3SmrgEND_TEST
2667ec681f3Smrg
2677ec681f3SmrgTemp create_subbrev_co(Operand op0, Operand op1, Operand op2)
2687ec681f3Smrg{
2697ec681f3Smrg   return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2);
2707ec681f3Smrg}
2717ec681f3Smrg
2727ec681f3SmrgBEGIN_TEST(optimize.cndmask)
2737ec681f3Smrg   for (unsigned i = GFX9; i <= GFX10; i++) {
2747ec681f3Smrg      //>> v1: %a, s1: %b, s2: %c = p_startpgm
2757ec681f3Smrg      if (!setup_cs("v1 s1 s2", (chip_class)i))
2767ec681f3Smrg         continue;
2777ec681f3Smrg
2787ec681f3Smrg      Temp subbrev;
2797ec681f3Smrg
2807ec681f3Smrg      //! v1: %res0 = v_cndmask_b32 0, %a, %c
2817ec681f3Smrg      //! p_unit_test 0, %res0
2827ec681f3Smrg      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
2837ec681f3Smrg      writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
2847ec681f3Smrg
2857ec681f3Smrg      //! v1: %res1 = v_cndmask_b32 0, 42, %c
2867ec681f3Smrg      //! p_unit_test 1, %res1
2877ec681f3Smrg      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
2887ec681f3Smrg      writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));
2897ec681f3Smrg
2907ec681f3Smrg      //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
2917ec681f3Smrg      //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
2927ec681f3Smrg      //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
2937ec681f3Smrg      //! p_unit_test 2, %res2
2947ec681f3Smrg      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
2957ec681f3Smrg      writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
2967ec681f3Smrg
2977ec681f3Smrg      //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
2987ec681f3Smrg      //! v1: %xor = v_xor_b32 %a, %subbrev1
2997ec681f3Smrg      //! v1: %res3 = v_cndmask_b32 0, %xor, %c
3007ec681f3Smrg      //! p_unit_test 3, %res3
3017ec681f3Smrg      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
3027ec681f3Smrg      Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
3037ec681f3Smrg      writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
3047ec681f3Smrg
3057ec681f3Smrg      //! v1: %res4 = v_cndmask_b32 0, %a, %c
3067ec681f3Smrg      //! p_unit_test 4, %res4
3077ec681f3Smrg      Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
3087ec681f3Smrg                                  Operand::c32(1u), Operand(inputs[2]));
3097ec681f3Smrg      Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
3107ec681f3Smrg      writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
3117ec681f3Smrg
3127ec681f3Smrg      finish_opt_test();
3137ec681f3Smrg   }
3147ec681f3SmrgEND_TEST
3157ec681f3Smrg
3167ec681f3SmrgBEGIN_TEST(optimize.add_lshl)
3177ec681f3Smrg   for (unsigned i = GFX8; i <= GFX10; i++) {
3187ec681f3Smrg      //>> s1: %a, v1: %b = p_startpgm
3197ec681f3Smrg      if (!setup_cs("s1 v1", (chip_class)i))
3207ec681f3Smrg         continue;
3217ec681f3Smrg
3227ec681f3Smrg      Temp shift;
3237ec681f3Smrg
3247ec681f3Smrg      //~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
3257ec681f3Smrg      //~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
3267ec681f3Smrg      //~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
3277ec681f3Smrg      //! p_unit_test 0, %res0
3287ec681f3Smrg      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
3297ec681f3Smrg                       Operand::c32(3u));
3307ec681f3Smrg      writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
3317ec681f3Smrg                           Operand::c32(4u)));
3327ec681f3Smrg
3337ec681f3Smrg      //~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
3347ec681f3Smrg      //~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
3357ec681f3Smrg      //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
3367ec681f3Smrg      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
3377ec681f3Smrg      //~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
3387ec681f3Smrg      //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
3397ec681f3Smrg      //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
3407ec681f3Smrg      //! p_unit_test 1, %res1
3417ec681f3Smrg      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
3427ec681f3Smrg                       Operand::c32(3u));
3437ec681f3Smrg      Temp sadd =
3447ec681f3Smrg         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
3457ec681f3Smrg      Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
3467ec681f3Smrg      writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
3477ec681f3Smrg
3487ec681f3Smrg      //~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
3497ec681f3Smrg      //~gfx8! v1: %res2,  s2: %_ = v_add_co_u32 %lshl2, %b
3507ec681f3Smrg      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
3517ec681f3Smrg      //! p_unit_test 2, %res2
3527ec681f3Smrg      Temp lshl =
3537ec681f3Smrg         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u));
3547ec681f3Smrg      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
3557ec681f3Smrg
3567ec681f3Smrg      //~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
3577ec681f3Smrg      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
3587ec681f3Smrg      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
3597ec681f3Smrg      //! p_unit_test 3, %res3
3607ec681f3Smrg      Operand a_24bit = Operand(inputs[0]);
3617ec681f3Smrg      a_24bit.set24bit(true);
3627ec681f3Smrg      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u));
3637ec681f3Smrg      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
3647ec681f3Smrg
3657ec681f3Smrg      //! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
3667ec681f3Smrg      //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
3677ec681f3Smrg      //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
3687ec681f3Smrg      //! p_unit_test 4, %carry
3697ec681f3Smrg      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
3707ec681f3Smrg      Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
3717ec681f3Smrg      writeout(4, carry);
3727ec681f3Smrg
3737ec681f3Smrg      //~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
3747ec681f3Smrg      //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
3757ec681f3Smrg      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
3767ec681f3Smrg      //! p_unit_test 5, %res5
3777ec681f3Smrg      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
3787ec681f3Smrg      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
3797ec681f3Smrg
3807ec681f3Smrg      //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
3817ec681f3Smrg      //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
3827ec681f3Smrg      //! p_unit_test 6, %res6
3837ec681f3Smrg      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
3847ec681f3Smrg      writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
3857ec681f3Smrg
3867ec681f3Smrg      //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
3877ec681f3Smrg      //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
3887ec681f3Smrg      //! p_unit_test 7, %res7
3897ec681f3Smrg      Operand a_16bit = Operand(inputs[0]);
3907ec681f3Smrg      a_16bit.set16bit(true);
3917ec681f3Smrg      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u));
3927ec681f3Smrg      writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
3937ec681f3Smrg
3947ec681f3Smrg      finish_opt_test();
3957ec681f3Smrg   }
3967ec681f3SmrgEND_TEST
3977ec681f3Smrg
3987ec681f3SmrgBEGIN_TEST(optimize.bcnt)
3997ec681f3Smrg   for (unsigned i = GFX8; i <= GFX10; i++) {
4007ec681f3Smrg      //>> v1: %a, s1: %b = p_startpgm
4017ec681f3Smrg      if (!setup_cs("v1 s1", (chip_class)i))
4027ec681f3Smrg         continue;
4037ec681f3Smrg
4047ec681f3Smrg      Temp bcnt;
4057ec681f3Smrg
4067ec681f3Smrg      //! v1: %res0 = v_bcnt_u32_b32 %a, %a
4077ec681f3Smrg      //! p_unit_test 0, %res0
4087ec681f3Smrg      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
4097ec681f3Smrg      writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
4107ec681f3Smrg
4117ec681f3Smrg      //! v1: %res1 = v_bcnt_u32_b32 %a, %b
4127ec681f3Smrg      //! p_unit_test 1, %res1
4137ec681f3Smrg      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
4147ec681f3Smrg      writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
4157ec681f3Smrg
4167ec681f3Smrg      //! v1: %res2 = v_bcnt_u32_b32 %a, 42
4177ec681f3Smrg      //! p_unit_test 2, %res2
4187ec681f3Smrg      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
4197ec681f3Smrg      writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
4207ec681f3Smrg
4217ec681f3Smrg      //! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
4227ec681f3Smrg      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
4237ec681f3Smrg      //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
4247ec681f3Smrg      //! p_unit_test 3, %res3
4257ec681f3Smrg      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
4267ec681f3Smrg      writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
4277ec681f3Smrg
4287ec681f3Smrg      //! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
4297ec681f3Smrg      //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
4307ec681f3Smrg      //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
4317ec681f3Smrg      //! p_unit_test 4, %carry
4327ec681f3Smrg      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
4337ec681f3Smrg      Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
4347ec681f3Smrg      writeout(4, carry);
4357ec681f3Smrg
4367ec681f3Smrg      finish_opt_test();
4377ec681f3Smrg   }
4387ec681f3SmrgEND_TEST
4397ec681f3Smrg
4407ec681f3Smrgstruct clamp_config {
4417ec681f3Smrg   const char *name;
4427ec681f3Smrg   aco_opcode min, max, med3;
4437ec681f3Smrg   Operand lb, ub;
4447ec681f3Smrg};
4457ec681f3Smrg
4467ec681f3Smrgstatic const clamp_config clamp_configs[] = {
4477ec681f3Smrg   /* 0.0, 4.0 */
4487ec681f3Smrg   {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
4497ec681f3Smrg    Operand::zero(), Operand::c32(0x40800000u)},
4507ec681f3Smrg   {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
4517ec681f3Smrg    Operand::c16(0u), Operand::c16(0x4400)},
4527ec681f3Smrg   /* -1.0, 0.0 */
4537ec681f3Smrg   {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
4547ec681f3Smrg    Operand::c32(0xbf800000u), Operand::zero()},
4557ec681f3Smrg   {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
4567ec681f3Smrg    Operand::c16(0xBC00), Operand::c16(0u)},
4577ec681f3Smrg   /* 0, 3 */
4587ec681f3Smrg   {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
4597ec681f3Smrg    Operand::zero(), Operand::c32(3u)},
4607ec681f3Smrg   {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
4617ec681f3Smrg    Operand::c16(0u), Operand::c16(3u)},
4627ec681f3Smrg   {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
4637ec681f3Smrg    Operand::zero(), Operand::c32(3u)},
4647ec681f3Smrg   {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
4657ec681f3Smrg    Operand::c16(0u), Operand::c16(3u)},
4667ec681f3Smrg   /* -5, 0 */
4677ec681f3Smrg   {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
4687ec681f3Smrg    Operand::c32(0xfffffffbu), Operand::zero()},
4697ec681f3Smrg   {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
4707ec681f3Smrg    Operand::c16(0xfffbu), Operand::c16(0u)},
4717ec681f3Smrg};
4727ec681f3Smrg
4737ec681f3SmrgBEGIN_TEST(optimize.clamp)
4747ec681f3Smrg   for (clamp_config cfg : clamp_configs) {
4757ec681f3Smrg      if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
4767ec681f3Smrg         continue;
4777ec681f3Smrg
4787ec681f3Smrg      //! cfg: @match_func(min max med3 lb ub)
4797ec681f3Smrg      fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
4807ec681f3Smrg      fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
4817ec681f3Smrg      fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
4827ec681f3Smrg      aco_print_operand(&cfg.lb, output);
4837ec681f3Smrg      fprintf(output, " ");
4847ec681f3Smrg      aco_print_operand(&cfg.ub, output);
4857ec681f3Smrg      fprintf(output, "\n");
4867ec681f3Smrg
4877ec681f3Smrg      //>> v1: %a, v1: %b, v1: %c = p_startpgm
4887ec681f3Smrg
4897ec681f3Smrg      //! v1: %res0 = @med3 @ub, @lb, %a
4907ec681f3Smrg      //! p_unit_test 0, %res0
4917ec681f3Smrg      writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
4927ec681f3Smrg                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
4937ec681f3Smrg
4947ec681f3Smrg      //! v1: %res1 = @med3 @lb, @ub, %a
4957ec681f3Smrg      //! p_unit_test 1, %res1
4967ec681f3Smrg      writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
4977ec681f3Smrg                           bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
4987ec681f3Smrg
4997ec681f3Smrg      /* min constant must be greater than max constant */
5007ec681f3Smrg      //! v1: %res2_tmp = @min @lb, %a
5017ec681f3Smrg      //! v1: %res2 = @max @ub, %res2_tmp
5027ec681f3Smrg      //! p_unit_test 2, %res2
5037ec681f3Smrg      writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
5047ec681f3Smrg                           bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
5057ec681f3Smrg
5067ec681f3Smrg      //! v1: %res3_tmp = @max @ub, %a
5077ec681f3Smrg      //! v1: %res3 = @min @lb, %res3_tmp
5087ec681f3Smrg      //! p_unit_test 3, %res3
5097ec681f3Smrg      writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
5107ec681f3Smrg                           bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
5117ec681f3Smrg
5127ec681f3Smrg      /* needs two constants */
5137ec681f3Smrg
5147ec681f3Smrg      //! v1: %res4_tmp = @max @lb, %a
5157ec681f3Smrg      //! v1: %res4 = @min %b, %res4_tmp
5167ec681f3Smrg      //! p_unit_test 4, %res4
5177ec681f3Smrg      writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
5187ec681f3Smrg                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
5197ec681f3Smrg
5207ec681f3Smrg      //! v1: %res5_tmp = @max %b, %a
5217ec681f3Smrg      //! v1: %res5 = @min @ub, %res5_tmp
5227ec681f3Smrg      //! p_unit_test 5, %res5
5237ec681f3Smrg      writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
5247ec681f3Smrg                           bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
5257ec681f3Smrg
5267ec681f3Smrg      //! v1: %res6_tmp = @max %c, %a
5277ec681f3Smrg      //! v1: %res6 = @min %b, %res6_tmp
5287ec681f3Smrg      //! p_unit_test 6, %res6
5297ec681f3Smrg      writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
5307ec681f3Smrg                           bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
5317ec681f3Smrg
5327ec681f3Smrg      /* correct NaN behaviour with precise */
5337ec681f3Smrg
5347ec681f3Smrg      //! v1: %res7 = @med3 @ub, @lb, %a
5357ec681f3Smrg      //! p_unit_test 7, %res7
5367ec681f3Smrg      Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
5377ec681f3Smrg      max.def(0).setPrecise(true);
5387ec681f3Smrg      Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
5397ec681f3Smrg      max.def(0).setPrecise(true);
5407ec681f3Smrg      writeout(7, min);
5417ec681f3Smrg
5427ec681f3Smrg      //! v1: (precise)%res8_tmp = @min @ub, %a
5437ec681f3Smrg      //! v1: %res8 = @max @lb, %res8_tmp
5447ec681f3Smrg      //! p_unit_test 8, %res8
5457ec681f3Smrg      min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
5467ec681f3Smrg      min.def(0).setPrecise(true);
5477ec681f3Smrg      writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
5487ec681f3Smrg
5497ec681f3Smrg      finish_opt_test();
5507ec681f3Smrg   }
5517ec681f3SmrgEND_TEST
5527ec681f3Smrg
/*
 * optimize.const_comparison_ordering (GFX9 only)
 *
 * Each case builds a self-comparison of an input (v_cmp_neq x, x or
 * v_cmp_eq x, x), a second comparison of that input against a constant, and
 * joins both with s_or_b64 / s_and_b64 before handing the result to
 * writeout(). The check comments in front of every case spell out the
 * instructions expected afterwards (presumably matched by the test harness
 * against the optimized program -- confirm in the test framework):
 *   - cases 0-5 and 10-11 expect one single v_cmp_* instruction,
 *   - cases 6-9 and 12-14 expect all three original instructions to remain.
 */
5537ec681f3SmrgBEGIN_TEST(optimize.const_comparison_ordering)
5547ec681f3Smrg   //>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
5557ec681f3Smrg   if (!setup_cs("v1 v1 v2 v1", GFX9))
5567ec681f3Smrg      return;
5577ec681f3Smrg
5587ec681f3Smrg   /* optimize to unordered comparison */
5597ec681f3Smrg   //! s2: %res0 = v_cmp_nge_f32 4.0, %a
5607ec681f3Smrg   //! p_unit_test 0, %res0
5617ec681f3Smrg   writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
5627ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
5637ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
5647ec681f3Smrg                                 Operand::c32(0x40800000u), inputs[0])));
5657ec681f3Smrg
5667ec681f3Smrg   //! s2: %res1 = v_cmp_nge_f32 4.0, %a
5677ec681f3Smrg   //! p_unit_test 1, %res1
5687ec681f3Smrg   writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
5697ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
5707ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
5717ec681f3Smrg                                 Operand::c32(0x40800000u), inputs[0])));
5727ec681f3Smrg
   /* same as case 0, but the constant reaches the comparison through a copy */
5737ec681f3Smrg   //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
5747ec681f3Smrg   //! p_unit_test 2, %res2
5757ec681f3Smrg   writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
5767ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
5777ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
5787ec681f3Smrg                                 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
5797ec681f3Smrg
5807ec681f3Smrg   /* optimize to ordered comparison */
5817ec681f3Smrg   //! s2: %res3 = v_cmp_lt_f32 4.0, %a
5827ec681f3Smrg   //! p_unit_test 3, %res3
5837ec681f3Smrg   writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
5847ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
5857ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
5867ec681f3Smrg                                 Operand::c32(0x40800000u), inputs[0])));
5877ec681f3Smrg
5887ec681f3Smrg   //! s2: %res4 = v_cmp_lt_f32 4.0, %a
5897ec681f3Smrg   //! p_unit_test 4, %res4
5907ec681f3Smrg   writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
5917ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
5927ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
5937ec681f3Smrg                                 Operand::c32(0x40800000u), inputs[0])));
5947ec681f3Smrg
   /* same as case 3, but the constant reaches the comparison through a copy */
5957ec681f3Smrg   //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
5967ec681f3Smrg   //! p_unit_test 5, %res5
5977ec681f3Smrg   writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
5987ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
5997ec681f3Smrg                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
6007ec681f3Smrg                                 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
6017ec681f3Smrg
6027ec681f3Smrg   /* similar but unoptimizable expressions */
   /* case 6: v_cmp_neq self-comparison joined with s_and_b64 instead of s_or_b64 */
6037ec681f3Smrg   //! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
6047ec681f3Smrg   //! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
6057ec681f3Smrg   //! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
6067ec681f3Smrg   //! p_unit_test 6, %res6
6077ec681f3Smrg   Temp src1 =
6087ec681f3Smrg      bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
6097ec681f3Smrg   Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
6107ec681f3Smrg   writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
6117ec681f3Smrg
   /* case 7: v_cmp_eq self-comparison joined with s_or_b64 instead of s_and_b64 */
6127ec681f3Smrg   //! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
6137ec681f3Smrg   //! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
6147ec681f3Smrg   //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
6157ec681f3Smrg   //! p_unit_test 7, %res7
6167ec681f3Smrg   src1 =
6177ec681f3Smrg      bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
6187ec681f3Smrg   src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
6197ec681f3Smrg   writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
6207ec681f3Smrg
   /* case 8: the constant comparison reads a different input (inputs[3]) than the self-comparison */
6217ec681f3Smrg   //! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
6227ec681f3Smrg   //! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
6237ec681f3Smrg   //! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
6247ec681f3Smrg   //! p_unit_test 8, %res8
6257ec681f3Smrg   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);
6267ec681f3Smrg   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
6277ec681f3Smrg   writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
6287ec681f3Smrg
   /* case 9: the v_cmp_neq compares two different inputs, so it is not a self-comparison */
6297ec681f3Smrg   //! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
6307ec681f3Smrg   //! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
6317ec681f3Smrg   //! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
6327ec681f3Smrg   //! p_unit_test 9, %res9
6337ec681f3Smrg   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
6347ec681f3Smrg   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
6357ec681f3Smrg   writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
6367ec681f3Smrg
6377ec681f3Smrg   /* bit sizes */
   /* case 10 compares a v2b value extracted from inputs[1]; case 11 uses the v2 input */
6387ec681f3Smrg   //! s2: %res10 = v_cmp_nge_f16 4.0, %b
6397ec681f3Smrg   //! p_unit_test 10, %res10
6407ec681f3Smrg   Temp input1_16 =
6417ec681f3Smrg      bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());
6427ec681f3Smrg   writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
6437ec681f3Smrg                         bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
6447ec681f3Smrg                         bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),
6457ec681f3Smrg                                  input1_16)));
6467ec681f3Smrg
6477ec681f3Smrg   //! s2: %res11 = v_cmp_nge_f64 4.0, %c
6487ec681f3Smrg   //! p_unit_test 11, %res11
6497ec681f3Smrg   writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
6507ec681f3Smrg                         bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
6517ec681f3Smrg                         bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),
6527ec681f3Smrg                                  Operand::c64(0x4010000000000000u), inputs[2])));
6537ec681f3Smrg
6547ec681f3Smrg   /* NaN */
   /* Cases 12-14 repeat the shape of case 0 with a NaN bit pattern as the
    * constant and expect the sequence to stay untouched. The all-ones 64-bit
    * pattern appears as -1 in the expected text of case 14.
    * NOTE(review): cases 12 and 14 feed inputs[0] (declared "v1" in the
    * setup_cs string) to f16/f64 comparisons, unlike cases 10 and 11 above --
    * confirm this is intentional. */
6557ec681f3Smrg   uint16_t nan16 = 0x7e00;
6567ec681f3Smrg   uint32_t nan32 = 0x7fc00000;
6577ec681f3Smrg   uint64_t nan64 = 0xffffffffffffffffllu;
6587ec681f3Smrg
6597ec681f3Smrg   //! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
6607ec681f3Smrg   //! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
6617ec681f3Smrg   //! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
6627ec681f3Smrg   //! p_unit_test 12, %res12
6637ec681f3Smrg   src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);
6647ec681f3Smrg   src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
6657ec681f3Smrg   writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
6667ec681f3Smrg
6677ec681f3Smrg   //! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
6687ec681f3Smrg   //! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
6697ec681f3Smrg   //! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
6707ec681f3Smrg   //! p_unit_test 13, %res13
6717ec681f3Smrg   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);
6727ec681f3Smrg   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
6737ec681f3Smrg   writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
6747ec681f3Smrg
6757ec681f3Smrg   //! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a
6767ec681f3Smrg   //! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a
6777ec681f3Smrg   //! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
6787ec681f3Smrg   //! p_unit_test 14, %res14
6797ec681f3Smrg   src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]);
6807ec681f3Smrg   src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);
6817ec681f3Smrg   writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
6827ec681f3Smrg
6837ec681f3Smrg   finish_opt_test();
6847ec681f3SmrgEND_TEST
6857ec681f3Smrg
/*
 * optimize.add3 (GFX9 only)
 *
 * Nests one v_add_u32 inside another. Case 0 expects the pair to become a
 * single v_add3_u32. Cases 1 and 2 set the VOP3 clamp flag on the inner and
 * on the outer add respectively and expect both adds to remain as written.
 */
6867ec681f3SmrgBEGIN_TEST(optimize.add3)
6877ec681f3Smrg   //>> v1: %a, v1: %b, v1: %c = p_startpgm
6887ec681f3Smrg   if (!setup_cs("v1 v1 v1", GFX9))
6897ec681f3Smrg      return;
6907ec681f3Smrg
   /* case 0: a + (b + c) with no modifiers */
6917ec681f3Smrg   //! v1: %res0 = v_add3_u32 %a, %b, %c
6927ec681f3Smrg   //! p_unit_test 0, %res0
6937ec681f3Smrg   Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
6947ec681f3Smrg   writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
6957ec681f3Smrg
   /* case 1: clamp set on the inner add (built with vop2_e64 so the VOP3 fields exist) */
6967ec681f3Smrg   //! v1: %tmp1 = v_add_u32 %b, %c clamp
6977ec681f3Smrg   //! v1: %res1 = v_add_u32 %a, %tmp1
6987ec681f3Smrg   //! p_unit_test 1, %res1
6997ec681f3Smrg   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
7007ec681f3Smrg   tmp.instr->vop3().clamp = true;
7017ec681f3Smrg   writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
7027ec681f3Smrg
   /* case 2: clamp set on the outer add */
7037ec681f3Smrg   //! v1: %tmp2 = v_add_u32 %b, %c
7047ec681f3Smrg   //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
7057ec681f3Smrg   //! p_unit_test 2, %res2
7067ec681f3Smrg   tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
7077ec681f3Smrg   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
7087ec681f3Smrg   tmp.instr->vop3().clamp = true;
7097ec681f3Smrg   writeout(2, tmp);
7107ec681f3Smrg
7117ec681f3Smrg   finish_opt_test();
7127ec681f3SmrgEND_TEST
7137ec681f3Smrg
/*
 * optimize.minmax (GFX9 through GFX10)
 *
 * Builds max(0, fneg(min(0, x))) where x is either fneg(a) (case 0) or a
 * itself (case 1), and expects a single v_max3_f32 with operands 0, -0 and
 * a / -a respectively. fneg() is a helper defined outside this part of the
 * file.
 */
7147ec681f3SmrgBEGIN_TEST(optimize.minmax)
7157ec681f3Smrg   for (unsigned i = GFX9; i <= GFX10; i++) {
7167ec681f3Smrg      //>> v1: %a = p_startpgm
7177ec681f3Smrg      if (!setup_cs("v1", (chip_class)i))
7187ec681f3Smrg         continue;
7197ec681f3Smrg
      /* case 0: the input is negated before the min, and the min result is negated again */
7207ec681f3Smrg      //! v1: %res0 = v_max3_f32 0, -0, %a
7217ec681f3Smrg      //! p_unit_test 0, %res0
7227ec681f3Smrg      Temp xor0 = fneg(inputs[0]);
7237ec681f3Smrg      Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
7247ec681f3Smrg      Temp xor1 = fneg(min);
7257ec681f3Smrg      writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
7267ec681f3Smrg
      /* case 1: only the min result is negated, so the expected operand is -a */
7277ec681f3Smrg      //! v1: %res1 = v_max3_f32 0, -0, -%a
7287ec681f3Smrg      //! p_unit_test 1, %res1
7297ec681f3Smrg      min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
7307ec681f3Smrg      xor1 = fneg(min);
7317ec681f3Smrg      writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
7327ec681f3Smrg
7337ec681f3Smrg      finish_opt_test();
7347ec681f3Smrg   }
7357ec681f3SmrgEND_TEST
7367ec681f3Smrg
/*
 * optimize.mad_32_24 (GFX8 through GFX9)
 *
 * Feeds a v_mul_u32_u24 into bld.vadd32(). Case 0 expects the pair to become
 * one v_mad_u32_u24. Case 1 passes `true` as the fourth vadd32 argument and
 * writes out the add's second definition; the expected text keeps the
 * multiply and a v_add_co_u32 whose s2 result is the value under test
 * (presumably the carry-out -- confirm against Builder::vadd32).
 */
7377ec681f3SmrgBEGIN_TEST(optimize.mad_32_24)
7387ec681f3Smrg   for (unsigned i = GFX8; i <= GFX9; i++) {
7397ec681f3Smrg      //>> v1: %a, v1: %b, v1: %c = p_startpgm
7407ec681f3Smrg      if (!setup_cs("v1 v1 v1", (chip_class)i))
7417ec681f3Smrg         continue;
7427ec681f3Smrg
7437ec681f3Smrg      //! v1: %res0 = v_mad_u32_u24 %b, %c, %a
7447ec681f3Smrg      //! p_unit_test 0, %res0
7457ec681f3Smrg      Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
7467ec681f3Smrg      writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));
7477ec681f3Smrg
      /* case 1: the second definition of the add is consumed, so no v_mad is expected */
7487ec681f3Smrg      //! v1: %res1_tmp = v_mul_u32_u24 %b, %c
7497ec681f3Smrg      //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
7507ec681f3Smrg      //! p_unit_test 1, %res1
7517ec681f3Smrg      mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
7527ec681f3Smrg      writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());
7537ec681f3Smrg
7547ec681f3Smrg      finish_opt_test();
7557ec681f3Smrg   }
7567ec681f3SmrgEND_TEST
7577ec681f3Smrg
/*
 * optimize.add_lshlrev (GFX8 through GFX10)
 *
 * Feeds a v_lshlrev_b32 into bld.vadd32(). The check comments carry a
 * ~gfx8 or ~gfx(9|10) prefix (presumably a filter on the test variant --
 * confirm in the test framework). Per the expected text:
 *   - GFX9/GFX10: every case becomes a single v_lshl_add_u32.
 *   - GFX8: cases 0-2 keep the shift and a v_add_co_u32; cases 3-5, where the
 *     shifted operand is flagged 24-bit or 16-bit and the shift amount is 3
 *     or 4, become v_mad_u32_u24 with multiplier 8 or 16.
 */
7587ec681f3SmrgBEGIN_TEST(optimize.add_lshlrev)
7597ec681f3Smrg   for (unsigned i = GFX8; i <= GFX10; i++) {
7607ec681f3Smrg      //>> v1: %a, v1: %b, s1: %c = p_startpgm
7617ec681f3Smrg      if (!setup_cs("v1 v1 s1", (chip_class)i))
7627ec681f3Smrg         continue;
7637ec681f3Smrg
7647ec681f3Smrg      Temp lshl;
7657ec681f3Smrg
      /* case 0: shift by 3, operand carries no 24-bit/16-bit flag */
7667ec681f3Smrg      //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
7677ec681f3Smrg      //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
7687ec681f3Smrg      //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
7697ec681f3Smrg      //! p_unit_test 0, %res0
7707ec681f3Smrg      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
7717ec681f3Smrg      writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
7727ec681f3Smrg
      /* case 1: 24-bit operand shifted by 7; GFX8 is still expected to keep shift + add */
7737ec681f3Smrg      //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
7747ec681f3Smrg      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
7757ec681f3Smrg      //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
7767ec681f3Smrg      //! p_unit_test 1, %res1
7777ec681f3Smrg      Operand a_24bit = Operand(inputs[0]);
7787ec681f3Smrg      a_24bit.set24bit(true);
7797ec681f3Smrg      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
7807ec681f3Smrg      writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
7817ec681f3Smrg
      /* case 2: the shift amount is a temporary rather than a constant */
7827ec681f3Smrg      //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
7837ec681f3Smrg      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
7847ec681f3Smrg      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
7857ec681f3Smrg      //! p_unit_test 2, %res2
7867ec681f3Smrg      Operand b_24bit = Operand(inputs[1]);
7877ec681f3Smrg      b_24bit.set24bit(true);
7887ec681f3Smrg      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
7897ec681f3Smrg      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
7907ec681f3Smrg
      /* case 3: 24-bit operand shifted by 3 */
7917ec681f3Smrg      //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
7927ec681f3Smrg      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
7937ec681f3Smrg      //! p_unit_test 3, %res3
7947ec681f3Smrg      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
7957ec681f3Smrg      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
7967ec681f3Smrg
      /* case 4: 16-bit operand shifted by 4 */
7977ec681f3Smrg      //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
7987ec681f3Smrg      //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
7997ec681f3Smrg      //! p_unit_test 4, %res4
8007ec681f3Smrg      Operand a_16bit = Operand(inputs[0]);
8017ec681f3Smrg      a_16bit.set16bit(true);
8027ec681f3Smrg      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
8037ec681f3Smrg      writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
8047ec681f3Smrg
      /* case 5: the s1 input is used both as the 24-bit shifted operand and as the addend */
8057ec681f3Smrg      //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
8067ec681f3Smrg      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
8077ec681f3Smrg      //! p_unit_test 5, %res5
8087ec681f3Smrg      Operand c_24bit = Operand(inputs[2]);
8097ec681f3Smrg      c_24bit.set24bit(true);
8107ec681f3Smrg      lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
8117ec681f3Smrg      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));
8127ec681f3Smrg
8137ec681f3Smrg      finish_opt_test();
8147ec681f3Smrg   }
8157ec681f3SmrgEND_TEST
8167ec681f3Smrg
/* Operation the denorm_propagation test places between its optional source
 * and destination instructions. The numeric values double as indices into
 * denorm_op_names. */
8177ec681f3Smrgenum denorm_op {
8187ec681f3Smrg   denorm_mul1 = 0, /* v_mul_f32 by 0x3f800000 */
8197ec681f3Smrg   denorm_fneg = 1, /* fneg(val) */
8207ec681f3Smrg   denorm_fabs = 2, /* fabs(val) */
8217ec681f3Smrg   denorm_fnegabs = 3, /* fneg(fabs(val)) */
8227ec681f3Smrg};
8237ec681f3Smrg
/* Printable name for each denorm_op, indexed by the enum value. The
 * denorm_propagation test uses these strings in its subvariant name and in
 * the line it prints to the test output; they are spelled the same as the
 * keys of the `ops` / `inline_ops` tables in that test's embedded check
 * script. */
8247ec681f3Smrgstatic const char *denorm_op_names[] = {
8257ec681f3Smrg   "mul1",
8267ec681f3Smrg   "fneg",
8277ec681f3Smrg   "fabs",
8287ec681f3Smrg   "fnegabs",
8297ec681f3Smrg};
8307ec681f3Smrg
/* One configuration of the denorm_propagation test. */
8317ec681f3Smrgstruct denorm_config {
8327ec681f3Smrg   bool flush; /* true selects fp_denorm_flush, false fp_denorm_keep, for block 0's denorm32 */
8337ec681f3Smrg   unsigned op; /* a denorm_op value */
8347ec681f3Smrg   aco_opcode src; /* instruction emitted before the op; aco_opcode::num_opcodes stands for "none" */
8357ec681f3Smrg   aco_opcode dest; /* instruction emitted after the op; aco_opcode::num_opcodes stands for "none" */
8367ec681f3Smrg};
8377ec681f3Smrg
/* Returns the short name used for a source/destination opcode of the
 * denorm_propagation test: "cndmask", "min" or "rcp" for v_cndmask_b32,
 * v_min_f32 and v_rcp_f32, and "none" for every other opcode. */
8387ec681f3Smrgstatic const char *srcdest_op_name(aco_opcode op)
8397ec681f3Smrg{
8407ec681f3Smrg   switch (op) {
8417ec681f3Smrg   case aco_opcode::v_cndmask_b32:
8427ec681f3Smrg      return "cndmask";
8437ec681f3Smrg   case aco_opcode::v_min_f32:
8447ec681f3Smrg      return "min";
8457ec681f3Smrg   case aco_opcode::v_rcp_f32:
8467ec681f3Smrg      return "rcp";
8477ec681f3Smrg   default:
8487ec681f3Smrg      return "none";
8497ec681f3Smrg   }
8507ec681f3Smrg}
8517ec681f3Smrg
/* Emits the instruction selected by `op` with `val` as its data operand and
 * returns the result, using the file-level builder `bld`:
 *   - v_cndmask_b32: operands are zero, val and inputs[1],
 *   - v_min_f32:     operands are zero and val,
 *   - v_rcp_f32:     single operand val.
 * For any other opcode nothing is emitted and `val` is returned unchanged. */
8527ec681f3Smrgstatic Temp emit_denorm_srcdest(aco_opcode op, Temp val)
8537ec681f3Smrg{
8547ec681f3Smrg   switch (op) {
8557ec681f3Smrg   case aco_opcode::v_cndmask_b32:
8567ec681f3Smrg      return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
8577ec681f3Smrg   case aco_opcode::v_min_f32:
8587ec681f3Smrg      return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
8597ec681f3Smrg   case aco_opcode::v_rcp_f32:
8607ec681f3Smrg      return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
8617ec681f3Smrg   default:
8627ec681f3Smrg      return val;
8637ec681f3Smrg   }
8647ec681f3Smrg}
8657ec681f3Smrg
/*
 * optimize.denorm_propagation (GFX8 through GFX9)
 *
 * For each chip this builds 48 configurations (2 flush settings x
 * [4 ops alone + 2 dest opcodes x 4 ops + 3 src opcodes x 4 ops]) and runs
 * one sub-test per configuration. Each sub-test emits
 *     [src instruction] -> op -> [dest instruction] -> v_cndmask_b32
 * and writes out the final v_cndmask_b32. The expected text is generated by
 * the embedded check script from the values printed to `output`: when
 * can_propagate is set, the op is expected to appear as an operand modifier
 * on the following instruction; otherwise it is expected to remain as a
 * separate v_mul_f32.
 */
8667ec681f3SmrgBEGIN_TEST(optimize.denorm_propagation)
8677ec681f3Smrg   for (unsigned i = GFX8; i <= GFX9; i++) {
8687ec681f3Smrg      std::vector<denorm_config> configs;
8697ec681f3Smrg      for (bool flush : {false, true}) {
         /* op alone: neither a source nor a destination instruction */
8707ec681f3Smrg         for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
8717ec681f3Smrg            configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});
8727ec681f3Smrg
         /* op followed by a destination instruction */
8737ec681f3Smrg         for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
8747ec681f3Smrg            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
8757ec681f3Smrg               configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
8767ec681f3Smrg         }
8777ec681f3Smrg
         /* op preceded by a source instruction */
8787ec681f3Smrg         for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
8797ec681f3Smrg            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
8807ec681f3Smrg               configs.push_back({flush, op, src, aco_opcode::num_opcodes});
8817ec681f3Smrg         }
8827ec681f3Smrg      }
8837ec681f3Smrg
8847ec681f3Smrg      for (denorm_config cfg : configs) {
         /* Sub-test name: _<flush|keep>_<src>_<op>_<dest>. All pieces are short
          * string literals, so the result fits the 128-byte buffer.
          * NOTE(review): sprintf is unbounded; snprintf would make that explicit. */
8857ec681f3Smrg         char subvariant[128];
8867ec681f3Smrg         sprintf(subvariant, "_%s_%s_%s_%s",
8877ec681f3Smrg                 cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
8887ec681f3Smrg                 denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
8897ec681f3Smrg         if (!setup_cs("v1 s2", (chip_class)i, CHIP_UNKNOWN, subvariant))
8907ec681f3Smrg            continue;
8917ec681f3Smrg
         /* Expectation fed to the check script: true when denormals are kept,
          * when the source or destination is v_rcp_f32, or when it is
          * v_min_f32 on GFX9 (the only i >= GFX9 value this loop reaches). */
8927ec681f3Smrg         bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
8937ec681f3Smrg                              cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||
8947ec681f3Smrg                              !cfg.flush;
8957ec681f3Smrg
         /* These two lines are printed into the test output; the first two check
          * comments below presumably capture them as the script variables src,
          * dest, op and can_propagate -- confirm in the test framework. */
8967ec681f3Smrg         fprintf(output, "src, dest, op: %s %s %s\n",
8977ec681f3Smrg                 srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
8987ec681f3Smrg         fprintf(output, "can_propagate: %u\n", can_propagate);
8997ec681f3Smrg         //! src, dest, op: $src $dest $op
9007ec681f3Smrg         //! can_propagate: #can_propagate
9017ec681f3Smrg         //>> v1: %a, s2: %b = p_startpgm
9027ec681f3Smrg
9037ec681f3Smrg         //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
9047ec681f3Smrg         //;             'min': 'v1: %{} = v_min_f32 0, {}',
9057ec681f3Smrg         //;             'rcp': 'v1: %{} = v_rcp_f32 {}'}
9067ec681f3Smrg         //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
9077ec681f3Smrg         //;        'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
9087ec681f3Smrg         //;        'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
9097ec681f3Smrg         //;        'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
9107ec681f3Smrg         //; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}
9117ec681f3Smrg
9127ec681f3Smrg         //; name = 'a'
9137ec681f3Smrg         //; if src != 'none':
9147ec681f3Smrg         //;    insert_pattern(patterns[src].format('src_res', '%'+name))
9157ec681f3Smrg         //;    name = 'src_res'
9167ec681f3Smrg
9177ec681f3Smrg         //; if can_propagate:
9187ec681f3Smrg         //;    name = inline_ops[op].format(name)
9197ec681f3Smrg         //; else:
9207ec681f3Smrg         //;    insert_pattern(ops[op].format('op_res', name))
9217ec681f3Smrg         //;    name = '%op_res'
9227ec681f3Smrg
9237ec681f3Smrg         //; if dest != 'none':
9247ec681f3Smrg         //;    insert_pattern(patterns[dest].format('dest_res', name))
9257ec681f3Smrg         //;    name = '%dest_res'
9267ec681f3Smrg
9277ec681f3Smrg         //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
9287ec681f3Smrg         //! p_unit_test 0, %res
9297ec681f3Smrg
         /* select the 32-bit denormal mode of the first block for this configuration */
9307ec681f3Smrg         program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;
9317ec681f3Smrg
         /* optional source instruction, then the op under test, then the optional destination */
9327ec681f3Smrg         Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
9337ec681f3Smrg         switch (cfg.op) {
9347ec681f3Smrg         case denorm_mul1:
9357ec681f3Smrg            val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
9367ec681f3Smrg            break;
9377ec681f3Smrg         case denorm_fneg:
9387ec681f3Smrg            val = fneg(val);
9397ec681f3Smrg            break;
9407ec681f3Smrg         case denorm_fabs:
9417ec681f3Smrg            val = fabs(val);
9427ec681f3Smrg            break;
9437ec681f3Smrg         case denorm_fnegabs:
9447ec681f3Smrg            val = fneg(fabs(val));
9457ec681f3Smrg            break;
9467ec681f3Smrg         }
9477ec681f3Smrg         val = emit_denorm_srcdest(cfg.dest, val);
         /* the final consumer is always a v_cndmask_b32 selecting between zero and val */
9487ec681f3Smrg         writeout(
9497ec681f3Smrg            0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));
9507ec681f3Smrg
9517ec681f3Smrg         finish_opt_test();
9527ec681f3Smrg      }
9537ec681f3Smrg   }
9547ec681f3SmrgEND_TEST
9557ec681f3Smrg
/*
 * optimizer.dpp (GFX10_3 only)
 *
 * Every case creates a v_mov_b32 with dpp_row_mirror and feeds its result
 * into another VALU instruction. Per the expected text:
 *   - cases 0, 1, 3, 4, 6, 7 and 8 expect one instruction that reads %a
 *     directly and carries "row_mirror bound_ctrl:1" (cases 1 and 7 with the
 *     subtraction turned into v_subrev_f32),
 *   - cases 2, 5, 9 and 10 expect the v_mov_b32 to remain: the consumer is
 *     already a DPP instruction (2), has clamp set (5), or has the s1 input
 *     as its other operand (9, 10).
 * NOTE(review): this test is registered as "optimizer.dpp" while the other
 * tests visible here use the "optimize." prefix -- confirm whether the
 * difference is intentional before renaming, since the name identifies the
 * test.
 */
9567ec681f3SmrgBEGIN_TEST(optimizer.dpp)
9577ec681f3Smrg   //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm
9587ec681f3Smrg   if (!setup_cs("v1 v1 s2 s1", GFX10_3))
9597ec681f3Smrg      return;
9607ec681f3Smrg
9617ec681f3Smrg   Operand a(inputs[0]);
9627ec681f3Smrg   Operand b(inputs[1]);
9637ec681f3Smrg   Operand c(inputs[2]);
9647ec681f3Smrg   Operand d(inputs[3]);
9657ec681f3Smrg
9667ec681f3Smrg   /* basic optimization */
9677ec681f3Smrg   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
9687ec681f3Smrg   //! p_unit_test 0, %res0
9697ec681f3Smrg   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
9707ec681f3Smrg   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
9717ec681f3Smrg   writeout(0, res0);
9727ec681f3Smrg
9737ec681f3Smrg   /* operand swapping */
9747ec681f3Smrg   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
9757ec681f3Smrg   //! p_unit_test 1, %res1
9767ec681f3Smrg   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
9777ec681f3Smrg   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
9787ec681f3Smrg   writeout(1, res1);
9797ec681f3Smrg
   /* case 2: the consumer already has its own DPP control (row_half_mirror) */
9807ec681f3Smrg   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
9817ec681f3Smrg   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
9827ec681f3Smrg   //! p_unit_test 2, %res2
9837ec681f3Smrg   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
9847ec681f3Smrg   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
9857ec681f3Smrg   writeout(2, res2);
9867ec681f3Smrg
9877ec681f3Smrg   /* modifiers */
   /* case 3: neg set on the v_mov_b32 itself */
9887ec681f3Smrg   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
9897ec681f3Smrg   //! p_unit_test 3, %res3
9907ec681f3Smrg   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
9917ec681f3Smrg   tmp3.instr->dpp().neg[0] = true;
9927ec681f3Smrg   Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
9937ec681f3Smrg   writeout(3, res3);
9947ec681f3Smrg
   /* case 4: neg set on the consumer's first operand */
9957ec681f3Smrg   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
9967ec681f3Smrg   //! p_unit_test 4, %res4
9977ec681f3Smrg   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
9987ec681f3Smrg   auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
9997ec681f3Smrg   res4.instr->vop3().neg[0] = true;
10007ec681f3Smrg   writeout(4, res4);
10017ec681f3Smrg
   /* case 5: clamp on the consumer; both instructions are expected to remain */
10027ec681f3Smrg   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
10037ec681f3Smrg   //! v1: %res5 = v_add_f32 %tmp5, %b clamp
10047ec681f3Smrg   //! p_unit_test 5, %res5
10057ec681f3Smrg   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
10067ec681f3Smrg   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
10077ec681f3Smrg   res5.instr->vop3().clamp = true;
10087ec681f3Smrg   writeout(5, res5);
10097ec681f3Smrg
   /* case 6: neg on the v_mov_b32 combined with abs on the consumer; only |%a| is expected */
10107ec681f3Smrg   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
10117ec681f3Smrg   //! p_unit_test 6, %res6
10127ec681f3Smrg   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
10137ec681f3Smrg   tmp6.instr->dpp().neg[0] = true;
10147ec681f3Smrg   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
10157ec681f3Smrg   res6.instr->vop3().abs[0] = true;
10167ec681f3Smrg   writeout(6, res6);
10177ec681f3Smrg
   /* case 7: abs on the non-DPP operand has to follow %b when the operands are swapped */
10187ec681f3Smrg   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
10197ec681f3Smrg   //! p_unit_test 7, %res7
10207ec681f3Smrg   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
10217ec681f3Smrg   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
10227ec681f3Smrg   res7.instr->vop3().abs[0] = true;
10237ec681f3Smrg   writeout(7, res7);
10247ec681f3Smrg
10257ec681f3Smrg   /* vcc */
10267ec681f3Smrg   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
10277ec681f3Smrg   //! p_unit_test 8, %res8
10287ec681f3Smrg   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
10297ec681f3Smrg   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
10307ec681f3Smrg   writeout(8, res8);
10317ec681f3Smrg
10327ec681f3Smrg   /* sgprs */
   /* cases 9 and 10: the other operand is the s1 input, in either position */
10337ec681f3Smrg   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
10347ec681f3Smrg   //! v1: %res9 = v_add_f32 %tmp9, %d
10357ec681f3Smrg   //! p_unit_test 9, %res9
10367ec681f3Smrg   Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
10377ec681f3Smrg   Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
10387ec681f3Smrg   writeout(9, res9);
10397ec681f3Smrg
10407ec681f3Smrg   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
10417ec681f3Smrg   //! v1: %res10 = v_add_f32 %d, %tmp10
10427ec681f3Smrg   //! p_unit_test 10, %res10
10437ec681f3Smrg   Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
10447ec681f3Smrg   Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
10457ec681f3Smrg   writeout(10, res10);
10467ec681f3Smrg
10477ec681f3Smrg   finish_opt_test();
10487ec681f3SmrgEND_TEST
10497ec681f3Smrg
/*
 * optimize.dpp_prop (GFX10 only)
 *
 * Uses a v1 copy of a constant (1, 0x12345678, 0x12345679) or of the s1
 * input as an operand of a DPP instruction built with dpp_row_sl(1). Per the
 * expected text:
 *   - when the copy is the first source (cases 0, 2, 4, 6), the instruction
 *     is expected without DPP modifiers and with the constant / %b in place
 *     of the copy,
 *   - when the copy is the second source (cases 1, 3, 5), the instruction is
 *     expected to keep "row_shl:1 bound_ctrl:1" and to keep reading the copy.
 */
10507ec681f3SmrgBEGIN_TEST(optimize.dpp_prop)
10517ec681f3Smrg   //>> v1: %a, s1: %b = p_startpgm
10527ec681f3Smrg   if (!setup_cs("v1 s1", GFX10))
10537ec681f3Smrg      return;
10547ec681f3Smrg
   /* cases 0 and 1 share the same copy of the constant 1 */
10557ec681f3Smrg   //! v1: %one = p_parallelcopy 1
10567ec681f3Smrg   //! v1: %res0 = v_mul_f32 1, %a
10577ec681f3Smrg   //! p_unit_test 0, %res0
10587ec681f3Smrg   Temp one = bld.copy(bld.def(v1), Operand::c32(1));
10597ec681f3Smrg   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
10607ec681f3Smrg
10617ec681f3Smrg   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
10627ec681f3Smrg   //! p_unit_test 1, %res1
10637ec681f3Smrg   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
10647ec681f3Smrg
   /* case 2: the copied constant is not one of the small values and is expected as a literal */
10657ec681f3Smrg   //! v1: %res2 = v_mul_f32 0x12345678, %a
10667ec681f3Smrg   //! p_unit_test 2, %res2
10677ec681f3Smrg   Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
10687ec681f3Smrg   writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
10697ec681f3Smrg
   /* NOTE(review): the check below names the second operand %literal, not
    * %literal2 as defined by the preceding p_parallelcopy check, so the two
    * are not tied together -- confirm whether %literal2 was intended. */
10707ec681f3Smrg   //! v1: %literal2 = p_parallelcopy 0x12345679
10717ec681f3Smrg   //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
10727ec681f3Smrg   //! p_unit_test 3, %res3
10737ec681f3Smrg   Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
10747ec681f3Smrg   writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
10757ec681f3Smrg
   /* cases 4-6 share one v1 copy of the s1 input */
10767ec681f3Smrg   //! v1: %b_v = p_parallelcopy %b
10777ec681f3Smrg   //! v1: %res4 = v_mul_f32 %b, %a
10787ec681f3Smrg   //! p_unit_test 4, %res4
10797ec681f3Smrg   Temp b_v = bld.copy(bld.def(v1), inputs[1]);
10807ec681f3Smrg   writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
10817ec681f3Smrg
10827ec681f3Smrg   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
10837ec681f3Smrg   //! p_unit_test 5, %res5
10847ec681f3Smrg   writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
10857ec681f3Smrg
   /* case 6: single-source DPP instruction */
10867ec681f3Smrg   //! v1: %res6 = v_rcp_f32 %b
10877ec681f3Smrg   //! p_unit_test 6, %res6
10887ec681f3Smrg   writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
10897ec681f3Smrg
10907ec681f3Smrg   finish_opt_test();
10917ec681f3SmrgEND_TEST
10927ec681f3Smrg
1093