Lines Matching refs:uint
949 uint * restrict out = (uint* restrict)_out;
953 (out+j)[0] = (uint)(i);
961 uint * restrict out = (uint* restrict)_out;
965 (out+j)[0] = (uint)(i);
966 (out+j)[1] = (uint)(i+1);
974 uint * restrict out = (uint* restrict)_out;
978 (out+j)[0] = (uint)(i);
979 (out+j)[1] = (uint)(i+1);
987 uint * restrict out = (uint* restrict)_out;
992 (out+j)[0] = (uint)(i);
993 (out+j)[1] = (uint)(i+1);
996 (out+j)[0] = (uint)(end);
997 (out+j)[1] = (uint)(start);
1004 uint * restrict out = (uint* restrict)_out;
1008 (out+j)[0] = (uint)(i);
1009 (out+j)[1] = (uint)(i+1);
1010 (out+j)[2] = (uint)(i+2);
1018 uint * restrict out = (uint* restrict)_out;
1022 (out+j)[0] = (uint)(i);
1023 (out+j)[1] = (uint)(i+1+(i&1));
1024 (out+j)[2] = (uint)(i+2-(i&1));
1032 uint * restrict out = (uint* restrict)_out;
1036 (out+j)[0] = (uint)(i+1);
1037 (out+j)[1] = (uint)(i+2);
1038 (out+j)[2] = (uint)(start);
1046 uint * restrict out = (uint* restrict)_out;
1050 (out+j+0)[0] = (uint)(i+0);
1051 (out+j+0)[1] = (uint)(i+1);
1052 (out+j+0)[2] = (uint)(i+2);
1053 (out+j+3)[0] = (uint)(i+0);
1054 (out+j+3)[1] = (uint)(i+2);
1055 (out+j+3)[2] = (uint)(i+3);
1063 uint * restrict out = (uint* restrict)_out;
1067 (out+j+0)[0] = (uint)(i+0);
1068 (out+j+0)[1] = (uint)(i+1);
1069 (out+j+0)[2] = (uint)(i+3);
1070 (out+j+3)[0] = (uint)(i+0);
1071 (out+j+3)[1] = (uint)(i+3);
1072 (out+j+3)[2] = (uint)(i+2);
1080 uint * restrict out = (uint* restrict)_out;
1084 (out+j)[0] = (uint)(start);
1085 (out+j)[1] = (uint)(i+1);
1086 (out+j)[2] = (uint)(i+2);
1094 uint * restrict out = (uint* restrict)_out;
1098 (out+j)[0] = (uint)(i+0);
1099 (out+j)[1] = (uint)(i+1);
1100 (out+j)[2] = (uint)(i+2);
1101 (out+j)[3] = (uint)(i+3);
1109 uint * restrict out = (uint* restrict)_out;
1113 (out+j)[0] = (uint)(i+0);
1114 (out+j)[1] = (uint)(i+1);
1115 (out+j)[2] = (uint)(i+2);
1116 (out+j)[3] = (uint)(i+3);
1124 uint * restrict out = (uint* restrict)_out;
1128 (out+j)[0] = (uint)(i+0);
1129 (out+j)[1] = (uint)(i+1);
1130 (out+j)[2] = (uint)(i+2);
1131 (out+j)[3] = (uint)(i+3);
1132 (out+j)[4] = (uint)(i+4);
1133 (out+j)[5] = (uint)(i+5);
1141 uint * restrict out = (uint* restrict)_out;
1147 (out+j)[0] = (uint)(i+0);
1148 (out+j)[1] = (uint)(i+1);
1149 (out+j)[2] = (uint)(i+2);
1150 (out+j)[3] = (uint)(i+3);
1151 (out+j)[4] = (uint)(i+4);
1152 (out+j)[5] = (uint)(i+5);
1155 (out+j)[0] = (uint)(i+2);
1156 (out+j)[1] = (uint)(i-2);
1157 (out+j)[2] = (uint)(i+0);
1158 (out+j)[3] = (uint)(i+3);
1159 (out+j)[4] = (uint)(i+4);
1160 (out+j)[5] = (uint)(i+6);
1169 uint * restrict out = (uint* restrict)_out;
1173 (out+j)[0] = (uint)(i);
1181 uint * restrict out = (uint* restrict)_out;
1185 (out+j)[0] = (uint)(i+1);
1186 (out+j)[1] = (uint)(i);
1194 uint * restrict out = (uint* restrict)_out;
1198 (out+j)[0] = (uint)(i+1);
1199 (out+j)[1] = (uint)(i);
1207 uint * restrict out = (uint* restrict)_out;
1212 (out+j)[0] = (uint)(i+1);
1213 (out+j)[1] = (uint)(i);
1216 (out+j)[0] = (uint)(start);
1217 (out+j)[1] = (uint)(end);
1224 uint * restrict out = (uint* restrict)_out;
1228 (out+j)[0] = (uint)(i+1);
1229 (out+j)[1] = (uint)(i+2);
1230 (out+j)[2] = (uint)(i);
1238 uint * restrict out = (uint* restrict)_out;
1242 (out+j)[0] = (uint)(i+1+(i&1));
1243 (out+j)[1] = (uint)(i+2-(i&1));
1244 (out+j)[2] = (uint)(i);
1252 uint * restrict out = (uint* restrict)_out;
1256 (out+j)[0] = (uint)(i+2);
1257 (out+j)[1] = (uint)(start);
1258 (out+j)[2] = (uint)(i+1);
1266 uint * restrict out = (uint* restrict)_out;
1270 (out+j+0)[0] = (uint)(i+1);
1271 (out+j+0)[1] = (uint)(i+2);
1272 (out+j+0)[2] = (uint)(i+0);
1273 (out+j+3)[0] = (uint)(i+2);
1274 (out+j+3)[1] = (uint)(i+3);
1275 (out+j+3)[2] = (uint)(i+0);
1283 uint * restrict out = (uint* restrict)_out;
1287 (out+j+0)[0] = (uint)(i+1);
1288 (out+j+0)[1] = (uint)(i+3);
1289 (out+j+0)[2] = (uint)(i+0);
1290 (out+j+3)[0] = (uint)(i+3);
1291 (out+j+3)[1] = (uint)(i+2);
1292 (out+j+3)[2] = (uint)(i+0);
1300 uint * restrict out = (uint* restrict)_out;
1304 (out+j)[0] = (uint)(i+1);
1305 (out+j)[1] = (uint)(i+2);
1306 (out+j)[2] = (uint)(start);
1314 uint * restrict out = (uint* restrict)_out;
1318 (out+j)[0] = (uint)(i+3);
1319 (out+j)[1] = (uint)(i+2);
1320 (out+j)[2] = (uint)(i+1);
1321 (out+j)[3] = (uint)(i+0);
1329 uint * restrict out = (uint* restrict)_out;
1333 (out+j)[0] = (uint)(i+3);
1334 (out+j)[1] = (uint)(i+2);
1335 (out+j)[2] = (uint)(i+1);
1336 (out+j)[3] = (uint)(i+0);
1344 uint * restrict out = (uint* restrict)_out;
1348 (out+j)[0] = (uint)(i+4);
1349 (out+j)[1] = (uint)(i+5);
1350 (out+j)[2] = (uint)(i+0);
1351 (out+j)[3] = (uint)(i+1);
1352 (out+j)[4] = (uint)(i+2);
1353 (out+j)[5] = (uint)(i+3);
1361 uint * restrict out = (uint* restrict)_out;
1367 (out+j)[0] = (uint)(i+4);
1368 (out+j)[1] = (uint)(i+5);
1369 (out+j)[2] = (uint)(i+0);
1370 (out+j)[3] = (uint)(i+1);
1371 (out+j)[4] = (uint)(i+2);
1372 (out+j)[5] = (uint)(i+3);
1375 (out+j)[0] = (uint)(i+4);
1376 (out+j)[1] = (uint)(i+6);
1377 (out+j)[2] = (uint)(i+2);
1378 (out+j)[3] = (uint)(i-2);
1379 (out+j)[4] = (uint)(i+0);
1380 (out+j)[5] = (uint)(i+3);
1389 uint * restrict out = (uint* restrict)_out;
1393 (out+j)[0] = (uint)(i);
1401 uint * restrict out = (uint* restrict)_out;
1405 (out+j)[0] = (uint)(i+1);
1406 (out+j)[1] = (uint)(i);
1414 uint * restrict out = (uint* restrict)_out;
1418 (out+j)[0] = (uint)(i+1);
1419 (out+j)[1] = (uint)(i);
1427 uint * restrict out = (uint* restrict)_out;
1432 (out+j)[0] = (uint)(i+1);
1433 (out+j)[1] = (uint)(i);
1436 (out+j)[0] = (uint)(start);
1437 (out+j)[1] = (uint)(end);
1444 uint * restrict out = (uint* restrict)_out;
1448 (out+j)[0] = (uint)(i+2);
1449 (out+j)[1] = (uint)(i);
1450 (out+j)[2] = (uint)(i+1);
1458 uint * restrict out = (uint* restrict)_out;
1462 (out+j)[0] = (uint)(i+2);
1463 (out+j)[1] = (uint)(i+(i&1));
1464 (out+j)[2] = (uint)(i+1-(i&1));
1472 uint * restrict out = (uint* restrict)_out;
1476 (out+j)[0] = (uint)(i+2);
1477 (out+j)[1] = (uint)(start);
1478 (out+j)[2] = (uint)(i+1);
1486 uint * restrict out = (uint* restrict)_out;
1490 (out+j+0)[0] = (uint)(i+3);
1491 (out+j+0)[1] = (uint)(i+0);
1492 (out+j+0)[2] = (uint)(i+1);
1493 (out+j+3)[0] = (uint)(i+3);
1494 (out+j+3)[1] = (uint)(i+1);
1495 (out+j+3)[2] = (uint)(i+2);
1503 uint * restrict out = (uint* restrict)_out;
1507 (out+j+0)[0] = (uint)(i+3);
1508 (out+j+0)[1] = (uint)(i+2);
1509 (out+j+0)[2] = (uint)(i+0);
1510 (out+j+3)[0] = (uint)(i+3);
1511 (out+j+3)[1] = (uint)(i+0);
1512 (out+j+3)[2] = (uint)(i+1);
1520 uint * restrict out = (uint* restrict)_out;
1524 (out+j)[0] = (uint)(start);
1525 (out+j)[1] = (uint)(i+1);
1526 (out+j)[2] = (uint)(i+2);
1534 uint * restrict out = (uint* restrict)_out;
1538 (out+j)[0] = (uint)(i+3);
1539 (out+j)[1] = (uint)(i+2);
1540 (out+j)[2] = (uint)(i+1);
1541 (out+j)[3] = (uint)(i+0);
1549 uint * restrict out = (uint* restrict)_out;
1553 (out+j)[0] = (uint)(i+3);
1554 (out+j)[1] = (uint)(i+2);
1555 (out+j)[2] = (uint)(i+1);
1556 (out+j)[3] = (uint)(i+0);
1564 uint * restrict out = (uint* restrict)_out;
1568 (out+j)[0] = (uint)(i+4);
1569 (out+j)[1] = (uint)(i+5);
1570 (out+j)[2] = (uint)(i+0);
1571 (out+j)[3] = (uint)(i+1);
1572 (out+j)[4] = (uint)(i+2);
1573 (out+j)[5] = (uint)(i+3);
1581 uint * restrict out = (uint* restrict)_out;
1587 (out+j)[0] = (uint)(i+4);
1588 (out+j)[1] = (uint)(i+5);
1589 (out+j)[2] = (uint)(i+0);
1590 (out+j)[3] = (uint)(i+1);
1591 (out+j)[4] = (uint)(i+2);
1592 (out+j)[5] = (uint)(i+3);
1595 (out+j)[0] = (uint)(i+4);
1596 (out+j)[1] = (uint)(i+6);
1597 (out+j)[2] = (uint)(i+2);
1598 (out+j)[3] = (uint)(i-2);
1599 (out+j)[4] = (uint)(i+0);
1600 (out+j)[5] = (uint)(i+3);
1609 uint * restrict out = (uint* restrict)_out;
1613 (out+j)[0] = (uint)(i);
1621 uint * restrict out = (uint* restrict)_out;
1625 (out+j)[0] = (uint)(i);
1626 (out+j)[1] = (uint)(i+1);
1634 uint * restrict out = (uint* restrict)_out;
1638 (out+j)[0] = (uint)(i);
1639 (out+j)[1] = (uint)(i+1);
1647 uint * restrict out = (uint* restrict)_out;
1652 (out+j)[0] = (uint)(i);
1653 (out+j)[1] = (uint)(i+1);
1656 (out+j)[0] = (uint)(end);
1657 (out+j)[1] = (uint)(start);
1664 uint * restrict out = (uint* restrict)_out;
1668 (out+j)[0] = (uint)(i);
1669 (out+j)[1] = (uint)(i+1);
1670 (out+j)[2] = (uint)(i+2);
1678 uint * restrict out = (uint* restrict)_out;
1682 (out+j)[0] = (uint)(i+(i&1));
1683 (out+j)[1] = (uint)(i+1-(i&1));
1684 (out+j)[2] = (uint)(i+2);
1692 uint * restrict out = (uint* restrict)_out;
1696 (out+j)[0] = (uint)(start);
1697 (out+j)[1] = (uint)(i+1);
1698 (out+j)[2] = (uint)(i+2);
1706 uint * restrict out = (uint* restrict)_out;
1710 (out+j+0)[0] = (uint)(i+0);
1711 (out+j+0)[1] = (uint)(i+1);
1712 (out+j+0)[2] = (uint)(i+3);
1713 (out+j+3)[0] = (uint)(i+1);
1714 (out+j+3)[1] = (uint)(i+2);
1715 (out+j+3)[2] = (uint)(i+3);
1723 uint * restrict out = (uint* restrict)_out;
1727 (out+j+0)[0] = (uint)(i+2);
1728 (out+j+0)[1] = (uint)(i+0);
1729 (out+j+0)[2] = (uint)(i+3);
1730 (out+j+3)[0] = (uint)(i+0);
1731 (out+j+3)[1] = (uint)(i+1);
1732 (out+j+3)[2] = (uint)(i+3);
1740 uint * restrict out = (uint* restrict)_out;
1744 (out+j)[0] = (uint)(i+1);
1745 (out+j)[1] = (uint)(i+2);
1746 (out+j)[2] = (uint)(start);
1754 uint * restrict out = (uint* restrict)_out;
1758 (out+j)[0] = (uint)(i+0);
1759 (out+j)[1] = (uint)(i+1);
1760 (out+j)[2] = (uint)(i+2);
1761 (out+j)[3] = (uint)(i+3);
1769 uint * restrict out = (uint* restrict)_out;
1773 (out+j)[0] = (uint)(i+0);
1774 (out+j)[1] = (uint)(i+1);
1775 (out+j)[2] = (uint)(i+2);
1776 (out+j)[3] = (uint)(i+3);
1784 uint * restrict out = (uint* restrict)_out;
1788 (out+j)[0] = (uint)(i+0);
1789 (out+j)[1] = (uint)(i+1);
1790 (out+j)[2] = (uint)(i+2);
1791 (out+j)[3] = (uint)(i+3);
1792 (out+j)[4] = (uint)(i+4);
1793 (out+j)[5] = (uint)(i+5);
1801 uint * restrict out = (uint* restrict)_out;
1807 (out+j)[0] = (uint)(i+0);
1808 (out+j)[1] = (uint)(i+1);
1809 (out+j)[2] = (uint)(i+2);
1810 (out+j)[3] = (uint)(i+3);
1811 (out+j)[4] = (uint)(i+4);
1812 (out+j)[5] = (uint)(i+5);
1815 (out+j)[0] = (uint)(i+2);
1816 (out+j)[1] = (uint)(i-2);
1817 (out+j)[2] = (uint)(i+0);
1818 (out+j)[3] = (uint)(i+3);
1819 (out+j)[4] = (uint)(i+4);
1820 (out+j)[5] = (uint)(i+6);
4521 uint * restrict out = (uint* restrict)_out;
4525 (out+j)[0] = (uint)in[i];
4537 uint * restrict out = (uint* restrict)_out;
4541 (out+j)[0] = (uint)in[i];
4542 (out+j)[1] = (uint)in[i+1];
4554 uint * restrict out = (uint* restrict)_out;
4558 (out+j)[0] = (uint)in[i];
4559 (out+j)[1] = (uint)in[i+1];
4571 uint * restrict out = (uint* restrict)_out;
4576 (out+j)[0] = (uint)in[i];
4577 (out+j)[1] = (uint)in[i+1];
4580 (out+j)[0] = (uint)in[end];
4581 (out+j)[1] = (uint)in[start];
4592 uint * restrict out = (uint* restrict)_out;
4596 (out+j)[0] = (uint)in[i];
4597 (out+j)[1] = (uint)in[i+1];
4598 (out+j)[2] = (uint)in[i+2];
4610 uint * restrict out = (uint* restrict)_out;
4614 (out+j)[0] = (uint)in[i];
4615 (out+j)[1] = (uint)in[i+1+(i&1)];
4616 (out+j)[2] = (uint)in[i+2-(i&1)];
4628 uint * restrict out = (uint* restrict)_out;
4632 (out+j)[0] = (uint)in[i+1];
4633 (out+j)[1] = (uint)in[i+2];
4634 (out+j)[2] = (uint)in[start];
4646 uint * restrict out = (uint* restrict)_out;
4650 (out+j+0)[0] = (uint)in[i+0];
4651 (out+j+0)[1] = (uint)in[i+1];
4652 (out+j+0)[2] = (uint)in[i+2];
4653 (out+j+3)[0] = (uint)in[i+0];
4654 (out+j+3)[1] = (uint)in[i+2];
4655 (out+j+3)[2] = (uint)in[i+3];
4667 uint * restrict out = (uint* restrict)_out;
4671 (out+j+0)[0] = (uint)in[i+0];
4672 (out+j+0)[1] = (uint)in[i+1];
4673 (out+j+0)[2] = (uint)in[i+3];
4674 (out+j+3)[0] = (uint)in[i+0];
4675 (out+j+3)[1] = (uint)in[i+3];
4676 (out+j+3)[2] = (uint)in[i+2];
4688 uint * restrict out = (uint* restrict)_out;
4692 (out+j)[0] = (uint)in[start];
4693 (out+j)[1] = (uint)in[i+1];
4694 (out+j)[2] = (uint)in[i+2];
4706 uint * restrict out = (uint* restrict)_out;
4710 (out+j)[0] = (uint)in[i+0];
4711 (out+j)[1] = (uint)in[i+1];
4712 (out+j)[2] = (uint)in[i+2];
4713 (out+j)[3] = (uint)in[i+3];
4725 uint * restrict out = (uint* restrict)_out;
4729 (out+j)[0] = (uint)in[i+0];
4730 (out+j)[1] = (uint)in[i+1];
4731 (out+j)[2] = (uint)in[i+2];
4732 (out+j)[3] = (uint)in[i+3];
4744 uint * restrict out = (uint* restrict)_out;
4748 (out+j)[0] = (uint)in[i+0];
4749 (out+j)[1] = (uint)in[i+1];
4750 (out+j)[2] = (uint)in[i+2];
4751 (out+j)[3] = (uint)in[i+3];
4752 (out+j)[4] = (uint)in[i+4];
4753 (out+j)[5] = (uint)in[i+5];
4765 uint * restrict out = (uint* restrict)_out;
4771 (out+j)[0] = (uint)in[i+0];
4772 (out+j)[1] = (uint)in[i+1];
4773 (out+j)[2] = (uint)in[i+2];
4774 (out+j)[3] = (uint)in[i+3];
4775 (out+j)[4] = (uint)in[i+4];
4776 (out+j)[5] = (uint)in[i+5];
4779 (out+j)[0] = (uint)in[i+2];
4780 (out+j)[1] = (uint)in[i-2];
4781 (out+j)[2] = (uint)in[i+0];
4782 (out+j)[3] = (uint)in[i+3];
4783 (out+j)[4] = (uint)in[i+4];
4784 (out+j)[5] = (uint)in[i+6];
4797 uint * restrict out = (uint* restrict)_out;
4801 (out+j)[0] = (uint)in[i];
4813 uint * restrict out = (uint* restrict)_out;
4817 (out+j)[0] = (uint)in[i];
4818 (out+j)[1] = (uint)in[i+1];
4830 uint * restrict out = (uint* restrict)_out;
4834 (out+j)[0] = (uint)in[i];
4835 (out+j)[1] = (uint)in[i+1];
4847 uint * restrict out = (uint* restrict)_out;
4860 (out+j)[0] = (uint)in[end];
4861 (out+j)[1] = (uint)in[start];
4869 (out+j)[0] = (uint)in[end];
4870 (out+j)[1] = (uint)in[start];
4876 (out+j)[0] = (uint)in[i];
4877 (out+j)[1] = (uint)in[i+1];
4880 (out+j)[0] = (uint)in[end];
4881 (out+j)[1] = (uint)in[start];
4892 uint * restrict out = (uint* restrict)_out;
4896 (out+j)[0] = (uint)in[i];
4897 (out+j)[1] = (uint)in[i+1];
4898 (out+j)[2] = (uint)in[i+2];
4910 uint * restrict out = (uint* restrict)_out;
4914 (out+j)[0] = (uint)in[i];
4915 (out+j)[1] = (uint)in[i+1+(i&1)];
4916 (out+j)[2] = (uint)in[i+2-(i&1)];
4928 uint * restrict out = (uint* restrict)_out;
4954 (out+j)[0] = (uint)in[i+1];
4955 (out+j)[1] = (uint)in[i+2];
4956 (out+j)[2] = (uint)in[start];
4968 uint * restrict out = (uint* restrict)_out;
4998 (out+j+0)[0] = (uint)in[i+0];
4999 (out+j+0)[1] = (uint)in[i+1];
5000 (out+j+0)[2] = (uint)in[i+2];
5001 (out+j+3)[0] = (uint)in[i+0];
5002 (out+j+3)[1] = (uint)in[i+2];
5003 (out+j+3)[2] = (uint)in[i+3];
5015 uint * restrict out = (uint* restrict)_out;
5045 (out+j+0)[0] = (uint)in[i+0];
5046 (out+j+0)[1] = (uint)in[i+1];
5047 (out+j+0)[2] = (uint)in[i+3];
5048 (out+j+3)[0] = (uint)in[i+0];
5049 (out+j+3)[1] = (uint)in[i+3];
5050 (out+j+3)[2] = (uint)in[i+2];
5062 uint * restrict out = (uint* restrict)_out;
5088 (out+j)[0] = (uint)in[start];
5089 (out+j)[1] = (uint)in[i+1];
5090 (out+j)[2] = (uint)in[i+2];
5102 uint * restrict out = (uint* restrict)_out;
5106 (out+j)[0] = (uint)in[i+0];
5107 (out+j)[1] = (uint)in[i+1];
5108 (out+j)[2] = (uint)in[i+2];
5109 (out+j)[3] = (uint)in[i+3];
5121 uint * restrict out = (uint* restrict)_out;
5125 (out+j)[0] = (uint)in[i+0];
5126 (out+j)[1] = (uint)in[i+1];
5127 (out+j)[2] = (uint)in[i+2];
5128 (out+j)[3] = (uint)in[i+3];
5140 uint * restrict out = (uint* restrict)_out;
5144 (out+j)[0] = (uint)in[i+0];
5145 (out+j)[1] = (uint)in[i+1];
5146 (out+j)[2] = (uint)in[i+2];
5147 (out+j)[3] = (uint)in[i+3];
5148 (out+j)[4] = (uint)in[i+4];
5149 (out+j)[5] = (uint)in[i+5];
5161 uint * restrict out = (uint* restrict)_out;
5167 (out+j)[0] = (uint)in[i+0];
5168 (out+j)[1] = (uint)in[i+1];
5169 (out+j)[2] = (uint)in[i+2];
5170 (out+j)[3] = (uint)in[i+3];
5171 (out+j)[4] = (uint)in[i+4];
5172 (out+j)[5] = (uint)in[i+5];
5175 (out+j)[0] = (uint)in[i+2];
5176 (out+j)[1] = (uint)in[i-2];
5177 (out+j)[2] = (uint)in[i+0];
5178 (out+j)[3] = (uint)in[i+3];
5179 (out+j)[4] = (uint)in[i+4];
5180 (out+j)[5] = (uint)in[i+6];
5193 uint * restrict out = (uint* restrict)_out;
5197 (out+j)[0] = (uint)in[i];
5209 uint * restrict out = (uint* restrict)_out;
5213 (out+j)[0] = (uint)in[i+1];
5214 (out+j)[1] = (uint)in[i];
5226 uint * restrict out = (uint* restrict)_out;
5230 (out+j)[0] = (uint)in[i+1];
5231 (out+j)[1] = (uint)in[i];
5243 uint * restrict out = (uint* restrict)_out;
5248 (out+j)[0] = (uint)in[i+1];
5249 (out+j)[1] = (uint)in[i];
5252 (out+j)[0] = (uint)in[start];
5253 (out+j)[1] = (uint)in[end];
5264 uint * restrict out = (uint* restrict)_out;
5268 (out+j)[0] = (uint)in[i+1];
5269 (out+j)[1] = (uint)in[i+2];
5270 (out+j)[2] = (uint)in[i];
5282 uint * restrict out = (uint* restrict)_out;
5286 (out+j)[0] = (uint)in[i+1+(i&1)];
5287 (out+j)[1] = (uint)in[i+2-(i&1)];
5288 (out+j)[2] = (uint)in[i];
5300 uint * restrict out = (uint* restrict)_out;
5304 (out+j)[0] = (uint)in[i+2];
5305 (out+j)[1] = (uint)in[start];
5306 (out+j)[2] = (uint)in[i+1];
5318 uint * restrict out = (uint* restrict)_out;
5322 (out+j+0)[0] = (uint)in[i+1];
5323 (out+j+0)[1] = (uint)in[i+2];
5324 (out+j+0)[2] = (uint)in[i+0];
5325 (out+j+3)[0] = (uint)in[i+2];
5326 (out+j+3)[1] = (uint)in[i+3];
5327 (out+j+3)[2] = (uint)in[i+0];
5339 uint * restrict out = (uint* restrict)_out;
5343 (out+j+0)[0] = (uint)in[i+1];
5344 (out+j+0)[1] = (uint)in[i+3];
5345 (out+j+0)[2] = (uint)in[i+0];
5346 (out+j+3)[0] = (uint)in[i+3];
5347 (out+j+3)[1] = (uint)in[i+2];
5348 (out+j+3)[2] = (uint)in[i+0];
5360 uint * restrict out = (uint* restrict)_out;
5364 (out+j)[0] = (uint)in[i+1];
5365 (out+j)[1] = (uint)in[i+2];
5366 (out+j)[2] = (uint)in[start];
5378 uint * restrict out = (uint* restrict)_out;
5382 (out+j)[0] = (uint)in[i+3];
5383 (out+j)[1] = (uint)in[i+2];
5384 (out+j)[2] = (uint)in[i+1];
5385 (out+j)[3] = (uint)in[i+0];
5397 uint * restrict out = (uint* restrict)_out;
5401 (out+j)[0] = (uint)in[i+3];
5402 (out+j)[1] = (uint)in[i+2];
5403 (out+j)[2] = (uint)in[i+1];
5404 (out+j)[3] = (uint)in[i+0];
5416 uint * restrict out = (uint* restrict)_out;
5420 (out+j)[0] = (uint)in[i+4];
5421 (out+j)[1] = (uint)in[i+5];
5422 (out+j)[2] = (uint)in[i+0];
5423 (out+j)[3] = (uint)in[i+1];
5424 (out+j)[4] = (uint)in[i+2];
5425 (out+j)[5] = (uint)in[i+3];
5437 uint * restrict out = (uint* restrict)_out;
5443 (out+j)[0] = (uint)in[i+4];
5444 (out+j)[1] = (uint)in[i+5];
5445 (out+j)[2] = (uint)in[i+0];
5446 (out+j)[3] = (uint)in[i+1];
5447 (out+j)[4] = (uint)in[i+2];
5448 (out+j)[5] = (uint)in[i+3];
5451 (out+j)[0] = (uint)in[i+4];
5452 (out+j)[1] = (uint)in[i+6];
5453 (out+j)[2] = (uint)in[i+2];
5454 (out+j)[3] = (uint)in[i-2];
5455 (out+j)[4] = (uint)in[i+0];
5456 (out+j)[5] = (uint)in[i+3];
5469 uint * restrict out = (uint* restrict)_out;
5473 (out+j)[0] = (uint)in[i];
5485 uint * restrict out = (uint* restrict)_out;
5489 (out+j)[0] = (uint)in[i+1];
5490 (out+j)[1] = (uint)in[i];
5502 uint * restrict out = (uint* restrict)_out;
5506 (out+j)[0] = (uint)in[i+1];
5507 (out+j)[1] = (uint)in[i];
5519 uint * restrict out = (uint* restrict)_out;
5532 (out+j)[0] = (uint)in[start];
5533 (out+j)[1] = (uint)in[end];
5541 (out+j)[0] = (uint)in[start];
5542 (out+j)[1] = (uint)in[end];
5548 (out+j)[0] = (uint)in[i+1];
5549 (out+j)[1] = (uint)in[i];
5552 (out+j)[0] = (uint)in[start];
5553 (out+j)[1] = (uint)in[end];
5564 uint * restrict out = (uint* restrict)_out;
5568 (out+j)[0] = (uint)in[i+1];
5569 (out+j)[1] = (uint)in[i+2];
5570 (out+j)[2] = (uint)in[i];
5582 uint * restrict out = (uint* restrict)_out;
5586 (out+j)[0] = (uint)in[i+1+(i&1)];
5587 (out+j)[1] = (uint)in[i+2-(i&1)];
5588 (out+j)[2] = (uint)in[i];
5600 uint * restrict out = (uint* restrict)_out;
5626 (out+j)[0] = (uint)in[i+2];
5627 (out+j)[1] = (uint)in[start];
5628 (out+j)[2] = (uint)in[i+1];
5640 uint * restrict out = (uint* restrict)_out;
5670 (out+j+0)[0] = (uint)in[i+1];
5671 (out+j+0)[1] = (uint)in[i+2];
5672 (out+j+0)[2] = (uint)in[i+0];
5673 (out+j+3)[0] = (uint)in[i+2];
5674 (out+j+3)[1] = (uint)in[i+3];
5675 (out+j+3)[2] = (uint)in[i+0];
5687 uint * restrict out = (uint* restrict)_out;
5717 (out+j+0)[0] = (uint)in[i+1];
5718 (out+j+0)[1] = (uint)in[i+3];
5719 (out+j+0)[2] = (uint)in[i+0];
5720 (out+j+3)[0] = (uint)in[i+3];
5721 (out+j+3)[1] = (uint)in[i+2];
5722 (out+j+3)[2] = (uint)in[i+0];
5734 uint * restrict out = (uint* restrict)_out;
5760 (out+j)[0] = (uint)in[i+1];
5761 (out+j)[1] = (uint)in[i+2];
5762 (out+j)[2] = (uint)in[start];
5774 uint * restrict out = (uint* restrict)_out;
5778 (out+j)[0] = (uint)in[i+3];
5779 (out+j)[1] = (uint)in[i+2];
5780 (out+j)[2] = (uint)in[i+1];
5781 (out+j)[3] = (uint)in[i+0];
5793 uint * restrict out = (uint* restrict)_out;
5797 (out+j)[0] = (uint)in[i+3];
5798 (out+j)[1] = (uint)in[i+2];
5799 (out+j)[2] = (uint)in[i+1];
5800 (out+j)[3] = (uint)in[i+0];
5812 uint * restrict out = (uint* restrict)_out;
5816 (out+j)[0] = (uint)in[i+4];
5817 (out+j)[1] = (uint)in[i+5];
5818 (out+j)[2] = (uint)in[i+0];
5819 (out+j)[3] = (uint)in[i+1];
5820 (out+j)[4] = (uint)in[i+2];
5821 (out+j)[5] = (uint)in[i+3];
5833 uint * restrict out = (uint* restrict)_out;
5839 (out+j)[0] = (uint)in[i+4];
5840 (out+j)[1] = (uint)in[i+5];
5841 (out+j)[2] = (uint)in[i+0];
5842 (out+j)[3] = (uint)in[i+1];
5843 (out+j)[4] = (uint)in[i+2];
5844 (out+j)[5] = (uint)in[i+3];
5847 (out+j)[0] = (uint)in[i+4];
5848 (out+j)[1] = (uint)in[i+6];
5849 (out+j)[2] = (uint)in[i+2];
5850 (out+j)[3] = (uint)in[i-2];
5851 (out+j)[4] = (uint)in[i+0];
5852 (out+j)[5] = (uint)in[i+3];
5865 uint * restrict out = (uint* restrict)_out;
5869 (out+j)[0] = (uint)in[i];
5881 uint * restrict out = (uint* restrict)_out;
5885 (out+j)[0] = (uint)in[i+1];
5886 (out+j)[1] = (uint)in[i];
5898 uint * restrict out = (uint* restrict)_out;
5902 (out+j)[0] = (uint)in[i+1];
5903 (out+j)[1] = (uint)in[i];
5915 uint * restrict out = (uint* restrict)_out;
5920 (out+j)[0] = (uint)in[i+1];
5921 (out+j)[1] = (uint)in[i];
5924 (out+j)[0] = (uint)in[start];
5925 (out+j)[1] = (uint)in[end];
5936 uint * restrict out = (uint* restrict)_out;
5940 (out+j)[0] = (uint)in[i+2];
5941 (out+j)[1] = (uint)in[i];
5942 (out+j)[2] = (uint)in[i+1];
5954 uint * restrict out = (uint* restrict)_out;
5958 (out+j)[0] = (uint)in[i+2];
5959 (out+j)[1] = (uint)in[i+(i&1)];
5960 (out+j)[2] = (uint)in[i+1-(i&1)];
5972 uint * restrict out = (uint* restrict)_out;
5976 (out+j)[0] = (uint)in[i+2];
5977 (out+j)[1] = (uint)in[start];
5978 (out+j)[2] = (uint)in[i+1];
5990 uint * restrict out = (uint* restrict)_out;
5994 (out+j+0)[0] = (uint)in[i+3];
5995 (out+j+0)[1] = (uint)in[i+0];
5996 (out+j+0)[2] = (uint)in[i+1];
5997 (out+j+3)[0] = (uint)in[i+3];
5998 (out+j+3)[1] = (uint)in[i+1];
5999 (out+j+3)[2] = (uint)in[i+2];
6011 uint * restrict out = (uint* restrict)_out;
6015 (out+j+0)[0] = (uint)in[i+3];
6016 (out+j+0)[1] = (uint)in[i+2];
6017 (out+j+0)[2] = (uint)in[i+0];
6018 (out+j+3)[0] = (uint)in[i+3];
6019 (out+j+3)[1] = (uint)in[i+0];
6020 (out+j+3)[2] = (uint)in[i+1];
6032 uint * restrict out = (uint* restrict)_out;
6036 (out+j)[0] = (uint)in[start];
6037 (out+j)[1] = (uint)in[i+1];
6038 (out+j)[2] = (uint)in[i+2];
6050 uint * restrict out = (uint* restrict)_out;
6054 (out+j)[0] = (uint)in[i+3];
6055 (out+j)[1] = (uint)in[i+2];
6056 (out+j)[2] = (uint)in[i+1];
6057 (out+j)[3] = (uint)in[i+0];
6069 uint * restrict out = (uint* restrict)_out;
6073 (out+j)[0] = (uint)in[i+3];
6074 (out+j)[1] = (uint)in[i+2];
6075 (out+j)[2] = (uint)in[i+1];
6076 (out+j)[3] = (uint)in[i+0];
6088 uint * restrict out = (uint* restrict)_out;
6092 (out+j)[0] = (uint)in[i+4];
6093 (out+j)[1] = (uint)in[i+5];
6094 (out+j)[2] = (uint)in[i+0];
6095 (out+j)[3] = (uint)in[i+1];
6096 (out+j)[4] = (uint)in[i+2];
6097 (out+j)[5] = (uint)in[i+3];
6109 uint * restrict out = (uint* restrict)_out;
6115 (out+j)[0] = (uint)in[i+4];
6116 (out+j)[1] = (uint)in[i+5];
6117 (out+j)[2] = (uint)in[i+0];
6118 (out+j)[3] = (uint)in[i+1];
6119 (out+j)[4] = (uint)in[i+2];
6120 (out+j)[5] = (uint)in[i+3];
6123 (out+j)[0] = (uint)in[i+4];
6124 (out+j)[1] = (uint)in[i+6];
6125 (out+j)[2] = (uint)in[i+2];
6126 (out+j)[3] = (uint)in[i-2];
6127 (out+j)[4] = (uint)in[i+0];
6128 (out+j)[5] = (uint)in[i+3];
6141 uint * restrict out = (uint* restrict)_out;
6145 (out+j)[0] = (uint)in[i];
6157 uint * restrict out = (uint* restrict)_out;
6161 (out+j)[0] = (uint)in[i+1];
6162 (out+j)[1] = (uint)in[i];
6174 uint * restrict out = (uint* restrict)_out;
6178 (out+j)[0] = (uint)in[i+1];
6179 (out+j)[1] = (uint)in[i];
6191 uint * restrict out = (uint* restrict)_out;
6204 (out+j)[0] = (uint)in[start];
6205 (out+j)[1] = (uint)in[end];
6213 (out+j)[0] = (uint)in[start];
6214 (out+j)[1] = (uint)in[end];
6220 (out+j)[0] = (uint)in[i+1];
6221 (out+j)[1] = (uint)in[i];
6224 (out+j)[0] = (uint)in[start];
6225 (out+j)[1] = (uint)in[end];
6236 uint * restrict out = (uint* restrict)_out;
6240 (out+j)[0] = (uint)in[i+2];
6241 (out+j)[1] = (uint)in[i];
6242 (out+j)[2] = (uint)in[i+1];
6254 uint * restrict out = (uint* restrict)_out;
6258 (out+j)[0] = (uint)in[i+2];
6259 (out+j)[1] = (uint)in[i+(i&1)];
6260 (out+j)[2] = (uint)in[i+1-(i&1)];
6272 uint * restrict out = (uint* restrict)_out;
6298 (out+j)[0] = (uint)in[i+2];
6299 (out+j)[1] = (uint)in[start];
6300 (out+j)[2] = (uint)in[i+1];
6312 uint * restrict out = (uint* restrict)_out;
6342 (out+j+0)[0] = (uint)in[i+3];
6343 (out+j+0)[1] = (uint)in[i+0];
6344 (out+j+0)[2] = (uint)in[i+1];
6345 (out+j+3)[0] = (uint)in[i+3];
6346 (out+j+3)[1] = (uint)in[i+1];
6347 (out+j+3)[2] = (uint)in[i+2];
6359 uint * restrict out = (uint* restrict)_out;
6389 (out+j+0)[0] = (uint)in[i+3];
6390 (out+j+0)[1] = (uint)in[i+2];
6391 (out+j+0)[2] = (uint)in[i+0];
6392 (out+j+3)[0] = (uint)in[i+3];
6393 (out+j+3)[1] = (uint)in[i+0];
6394 (out+j+3)[2] = (uint)in[i+1];
6406 uint * restrict out = (uint* restrict)_out;
6432 (out+j)[0] = (uint)in[start];
6433 (out+j)[1] = (uint)in[i+1];
6434 (out+j)[2] = (uint)in[i+2];
6446 uint * restrict out = (uint* restrict)_out;
6450 (out+j)[0] = (uint)in[i+3];
6451 (out+j)[1] = (uint)in[i+2];
6452 (out+j)[2] = (uint)in[i+1];
6453 (out+j)[3] = (uint)in[i+0];
6465 uint * restrict out = (uint* restrict)_out;
6469 (out+j)[0] = (uint)in[i+3];
6470 (out+j)[1] = (uint)in[i+2];
6471 (out+j)[2] = (uint)in[i+1];
6472 (out+j)[3] = (uint)in[i+0];
6484 uint * restrict out = (uint* restrict)_out;
6488 (out+j)[0] = (uint)in[i+4];
6489 (out+j)[1] = (uint)in[i+5];
6490 (out+j)[2] = (uint)in[i+0];
6491 (out+j)[3] = (uint)in[i+1];
6492 (out+j)[4] = (uint)in[i+2];
6493 (out+j)[5] = (uint)in[i+3];
6505 uint * restrict out = (uint* restrict)_out;
6511 (out+j)[0] = (uint)in[i+4];
6512 (out+j)[1] = (uint)in[i+5];
6513 (out+j)[2] = (uint)in[i+0];
6514 (out+j)[3] = (uint)in[i+1];
6515 (out+j)[4] = (uint)in[i+2];
6516 (out+j)[5] = (uint)in[i+3];
6519 (out+j)[0] = (uint)in[i+4];
6520 (out+j)[1] = (uint)in[i+6];
6521 (out+j)[2] = (uint)in[i+2];
6522 (out+j)[3] = (uint)in[i-2];
6523 (out+j)[4] = (uint)in[i+0];
6524 (out+j)[5] = (uint)in[i+3];
6537 uint * restrict out = (uint* restrict)_out;
6541 (out+j)[0] = (uint)in[i];
6553 uint * restrict out = (uint* restrict)_out;
6557 (out+j)[0] = (uint)in[i];
6558 (out+j)[1] = (uint)in[i+1];
6570 uint * restrict out = (uint* restrict)_out;
6574 (out+j)[0] = (uint)in[i];
6575 (out+j)[1] = (uint)in[i+1];
6587 uint * restrict out = (uint* restrict)_out;
6592 (out+j)[0] = (uint)in[i];
6593 (out+j)[1] = (uint)in[i+1];
6596 (out+j)[0] = (uint)in[end];
6597 (out+j)[1] = (uint)in[start];
6608 uint * restrict out = (uint* restrict)_out;
6612 (out+j)[0] = (uint)in[i];
6613 (out+j)[1] = (uint)in[i+1];
6614 (out+j)[2] = (uint)in[i+2];
6626 uint * restrict out = (uint* restrict)_out;
6630 (out+j)[0] = (uint)in[i+(i&1)];
6631 (out+j)[1] = (uint)in[i+1-(i&1)];
6632 (out+j)[2] = (uint)in[i+2];
6644 uint * restrict out = (uint* restrict)_out;
6648 (out+j)[0] = (uint)in[start];
6649 (out+j)[1] = (uint)in[i+1];
6650 (out+j)[2] = (uint)in[i+2];
6662 uint * restrict out = (uint* restrict)_out;
6666 (out+j+0)[0] = (uint)in[i+0];
6667 (out+j+0)[1] = (uint)in[i+1];
6668 (out+j+0)[2] = (uint)in[i+3];
6669 (out+j+3)[0] = (uint)in[i+1];
6670 (out+j+3)[1] = (uint)in[i+2];
6671 (out+j+3)[2] = (uint)in[i+3];
6683 uint * restrict out = (uint* restrict)_out;
6687 (out+j+0)[0] = (uint)in[i+2];
6688 (out+j+0)[1] = (uint)in[i+0];
6689 (out+j+0)[2] = (uint)in[i+3];
6690 (out+j+3)[0] = (uint)in[i+0];
6691 (out+j+3)[1] = (uint)in[i+1];
6692 (out+j+3)[2] = (uint)in[i+3];
6704 uint * restrict out = (uint* restrict)_out;
6708 (out+j)[0] = (uint)in[i+1];
6709 (out+j)[1] = (uint)in[i+2];
6710 (out+j)[2] = (uint)in[start];
6722 uint * restrict out = (uint* restrict)_out;
6726 (out+j)[0] = (uint)in[i+0];
6727 (out+j)[1] = (uint)in[i+1];
6728 (out+j)[2] = (uint)in[i+2];
6729 (out+j)[3] = (uint)in[i+3];
6741 uint * restrict out = (uint* restrict)_out;
6745 (out+j)[0] = (uint)in[i+0];
6746 (out+j)[1] = (uint)in[i+1];
6747 (out+j)[2] = (uint)in[i+2];
6748 (out+j)[3] = (uint)in[i+3];
6760 uint * restrict out = (uint* restrict)_out;
6764 (out+j)[0] = (uint)in[i+0];
6765 (out+j)[1] = (uint)in[i+1];
6766 (out+j)[2] = (uint)in[i+2];
6767 (out+j)[3] = (uint)in[i+3];
6768 (out+j)[4] = (uint)in[i+4];
6769 (out+j)[5] = (uint)in[i+5];
6781 uint * restrict out = (uint* restrict)_out;
6787 (out+j)[0] = (uint)in[i+0];
6788 (out+j)[1] = (uint)in[i+1];
6789 (out+j)[2] = (uint)in[i+2];
6790 (out+j)[3] = (uint)in[i+3];
6791 (out+j)[4] = (uint)in[i+4];
6792 (out+j)[5] = (uint)in[i+5];
6795 (out+j)[0] = (uint)in[i+2];
6796 (out+j)[1] = (uint)in[i-2];
6797 (out+j)[2] = (uint)in[i+0];
6798 (out+j)[3] = (uint)in[i+3];
6799 (out+j)[4] = (uint)in[i+4];
6800 (out+j)[5] = (uint)in[i+6];
6813 uint * restrict out = (uint* restrict)_out;
6817 (out+j)[0] = (uint)in[i];
6829 uint * restrict out = (uint* restrict)_out;
6833 (out+j)[0] = (uint)in[i];
6834 (out+j)[1] = (uint)in[i+1];
6846 uint * restrict out = (uint* restrict)_out;
6850 (out+j)[0] = (uint)in[i];
6851 (out+j)[1] = (uint)in[i+1];
6863 uint * restrict out = (uint* restrict)_out;
6876 (out+j)[0] = (uint)in[end];
6877 (out+j)[1] = (uint)in[start];
6885 (out+j)[0] = (uint)in[end];
6886 (out+j)[1] = (uint)in[start];
6892 (out+j)[0] = (uint)in[i];
6893 (out+j)[1] = (uint)in[i+1];
6896 (out+j)[0] = (uint)in[end];
6897 (out+j)[1] = (uint)in[start];
6908 uint * restrict out = (uint* restrict)_out;
6912 (out+j)[0] = (uint)in[i];
6913 (out+j)[1] = (uint)in[i+1];
6914 (out+j)[2] = (uint)in[i+2];
6926 uint * restrict out = (uint* restrict)_out;
6930 (out+j)[0] = (uint)in[i+(i&1)];
6931 (out+j)[1] = (uint)in[i+1-(i&1)];
6932 (out+j)[2] = (uint)in[i+2];
6944 uint * restrict out = (uint* restrict)_out;
6970 (out+j)[0] = (uint)in[start];
6971 (out+j)[1] = (uint)in[i+1];
6972 (out+j)[2] = (uint)in[i+2];
6984 uint * restrict out = (uint* restrict)_out;
7014 (out+j+0)[0] = (uint)in[i+0];
7015 (out+j+0)[1] = (uint)in[i+1];
7016 (out+j+0)[2] = (uint)in[i+3];
7017 (out+j+3)[0] = (uint)in[i+1];
7018 (out+j+3)[1] = (uint)in[i+2];
7019 (out+j+3)[2] = (uint)in[i+3];
7031 uint * restrict out = (uint* restrict)_out;
7061 (out+j+0)[0] = (uint)in[i+2];
7062 (out+j+0)[1] = (uint)in[i+0];
7063 (out+j+0)[2] = (uint)in[i+3];
7064 (out+j+3)[0] = (uint)in[i+0];
7065 (out+j+3)[1] = (uint)in[i+1];
7066 (out+j+3)[2] = (uint)in[i+3];
7078 uint * restrict out = (uint* restrict)_out;
7104 (out+j)[0] = (uint)in[i+1];
7105 (out+j)[1] = (uint)in[i+2];
7106 (out+j)[2] = (uint)in[start];
7118 uint * restrict out = (uint* restrict)_out;
7122 (out+j)[0] = (uint)in[i+0];
7123 (out+j)[1] = (uint)in[i+1];
7124 (out+j)[2] = (uint)in[i+2];
7125 (out+j)[3] = (uint)in[i+3];
7137 uint * restrict out = (uint* restrict)_out;
7141 (out+j)[0] = (uint)in[i+0];
7142 (out+j)[1] = (uint)in[i+1];
7143 (out+j)[2] = (uint)in[i+2];
7144 (out+j)[3] = (uint)in[i+3];
7156 uint * restrict out = (uint* restrict)_out;
7160 (out+j)[0] = (uint)in[i+0];
7161 (out+j)[1] = (uint)in[i+1];
7162 (out+j)[2] = (uint)in[i+2];
7163 (out+j)[3] = (uint)in[i+3];
7164 (out+j)[4] = (uint)in[i+4];
7165 (out+j)[5] = (uint)in[i+5];
7177 uint * restrict out = (uint* restrict)_out;
7183 (out+j)[0] = (uint)in[i+0];
7184 (out+j)[1] = (uint)in[i+1];
7185 (out+j)[2] = (uint)in[i+2];
7186 (out+j)[3] = (uint)in[i+3];
7187 (out+j)[4] = (uint)in[i+4];
7188 (out+j)[5] = (uint)in[i+5];
7191 (out+j)[0] = (uint)in[i+2];
7192 (out+j)[1] = (uint)in[i-2];
7193 (out+j)[2] = (uint)in[i+0];
7194 (out+j)[3] = (uint)in[i+3];
7195 (out+j)[4] = (uint)in[i+4];
7196 (out+j)[5] = (uint)in[i+6];
9897 uint * restrict out = (uint* restrict)_out;
9901 (out+j)[0] = (uint)in[i];
9913 uint * restrict out = (uint* restrict)_out;
9917 (out+j)[0] = (uint)in[i];
9918 (out+j)[1] = (uint)in[i+1];
9930 uint * restrict out = (uint* restrict)_out;
9934 (out+j)[0] = (uint)in[i];
9935 (out+j)[1] = (uint)in[i+1];
9947 uint * restrict out = (uint* restrict)_out;
9952 (out+j)[0] = (uint)in[i];
9953 (out+j)[1] = (uint)in[i+1];
9956 (out+j)[0] = (uint)in[end];
9957 (out+j)[1] = (uint)in[start];
9968 uint * restrict out = (uint* restrict)_out;
9972 (out+j)[0] = (uint)in[i];
9973 (out+j)[1] = (uint)in[i+1];
9974 (out+j)[2] = (uint)in[i+2];
9986 uint * restrict out = (uint* restrict)_out;
9990 (out+j)[0] = (uint)in[i];
9991 (out+j)[1] = (uint)in[i+1+(i&1)];
9992 (out+j)[2] = (uint)in[i+2-(i&1)];
10004 uint * restrict out = (uint* restrict)_out;
10008 (out+j)[0] = (uint)in[i+1];
10009 (out+j)[1] = (uint)in[i+2];
10010 (out+j)[2] = (uint)in[start];
10022 uint * restrict out = (uint* restrict)_out;
10026 (out+j+0)[0] = (uint)in[i+0];
10027 (out+j+0)[1] = (uint)in[i+1];
10028 (out+j+0)[2] = (uint)in[i+2];
10029 (out+j+3)[0] = (uint)in[i+0];
10030 (out+j+3)[1] = (uint)in[i+2];
10031 (out+j+3)[2] = (uint)in[i+3];
10043 uint * restrict out = (uint* restrict)_out;
10047 (out+j+0)[0] = (uint)in[i+0];
10048 (out+j+0)[1] = (uint)in[i+1];
10049 (out+j+0)[2] = (uint)in[i+3];
10050 (out+j+3)[0] = (uint)in[i+0];
10051 (out+j+3)[1] = (uint)in[i+3];
10052 (out+j+3)[2] = (uint)in[i+2];
10064 uint * restrict out = (uint* restrict)_out;
10068 (out+j)[0] = (uint)in[start];
10069 (out+j)[1] = (uint)in[i+1];
10070 (out+j)[2] = (uint)in[i+2];
10082 uint * restrict out = (uint* restrict)_out;
10086 (out+j)[0] = (uint)in[i+0];
10087 (out+j)[1] = (uint)in[i+1];
10088 (out+j)[2] = (uint)in[i+2];
10089 (out+j)[3] = (uint)in[i+3];
10101 uint * restrict out = (uint* restrict)_out;
10105 (out+j)[0] = (uint)in[i+0];
10106 (out+j)[1] = (uint)in[i+1];
10107 (out+j)[2] = (uint)in[i+2];
10108 (out+j)[3] = (uint)in[i+3];
10120 uint * restrict out = (uint* restrict)_out;
10124 (out+j)[0] = (uint)in[i+0];
10125 (out+j)[1] = (uint)in[i+1];
10126 (out+j)[2] = (uint)in[i+2];
10127 (out+j)[3] = (uint)in[i+3];
10128 (out+j)[4] = (uint)in[i+4];
10129 (out+j)[5] = (uint)in[i+5];
10141 uint * restrict out = (uint* restrict)_out;
10147 (out+j)[0] = (uint)in[i+0];
10148 (out+j)[1] = (uint)in[i+1];
10149 (out+j)[2] = (uint)in[i+2];
10150 (out+j)[3] = (uint)in[i+3];
10151 (out+j)[4] = (uint)in[i+4];
10152 (out+j)[5] = (uint)in[i+5];
10155 (out+j)[0] = (uint)in[i+2];
10156 (out+j)[1] = (uint)in[i-2];
10157 (out+j)[2] = (uint)in[i+0];
10158 (out+j)[3] = (uint)in[i+3];
10159 (out+j)[4] = (uint)in[i+4];
10160 (out+j)[5] = (uint)in[i+6];
10173 uint * restrict out = (uint* restrict)_out;
10177 (out+j)[0] = (uint)in[i];
10189 uint * restrict out = (uint* restrict)_out;
10193 (out+j)[0] = (uint)in[i];
10194 (out+j)[1] = (uint)in[i+1];
10206 uint * restrict out = (uint* restrict)_out;
10210 (out+j)[0] = (uint)in[i];
10211 (out+j)[1] = (uint)in[i+1];
10223 uint * restrict out = (uint* restrict)_out;
10236 (out+j)[0] = (uint)in[end];
10237 (out+j)[1] = (uint)in[start];
10245 (out+j)[0] = (uint)in[end];
10246 (out+j)[1] = (uint)in[start];
10252 (out+j)[0] = (uint)in[i];
10253 (out+j)[1] = (uint)in[i+1];
10256 (out+j)[0] = (uint)in[end];
10257 (out+j)[1] = (uint)in[start];
10268 uint * restrict out = (uint* restrict)_out;
10272 (out+j)[0] = (uint)in[i];
10273 (out+j)[1] = (uint)in[i+1];
10274 (out+j)[2] = (uint)in[i+2];
10286 uint * restrict out = (uint* restrict)_out;
10290 (out+j)[0] = (uint)in[i];
10291 (out+j)[1] = (uint)in[i+1+(i&1)];
10292 (out+j)[2] = (uint)in[i+2-(i&1)];
10304 uint * restrict out = (uint* restrict)_out;
10330 (out+j)[0] = (uint)in[i+1];
10331 (out+j)[1] = (uint)in[i+2];
10332 (out+j)[2] = (uint)in[start];
10344 uint * restrict out = (uint* restrict)_out;
10374 (out+j+0)[0] = (uint)in[i+0];
10375 (out+j+0)[1] = (uint)in[i+1];
10376 (out+j+0)[2] = (uint)in[i+2];
10377 (out+j+3)[0] = (uint)in[i+0];
10378 (out+j+3)[1] = (uint)in[i+2];
10379 (out+j+3)[2] = (uint)in[i+3];
10391 uint * restrict out = (uint* restrict)_out;
10421 (out+j+0)[0] = (uint)in[i+0];
10422 (out+j+0)[1] = (uint)in[i+1];
10423 (out+j+0)[2] = (uint)in[i+3];
10424 (out+j+3)[0] = (uint)in[i+0];
10425 (out+j+3)[1] = (uint)in[i+3];
10426 (out+j+3)[2] = (uint)in[i+2];
10438 uint * restrict out = (uint* restrict)_out;
10464 (out+j)[0] = (uint)in[start];
10465 (out+j)[1] = (uint)in[i+1];
10466 (out+j)[2] = (uint)in[i+2];
10478 uint * restrict out = (uint* restrict)_out;
10482 (out+j)[0] = (uint)in[i+0];
10483 (out+j)[1] = (uint)in[i+1];
10484 (out+j)[2] = (uint)in[i+2];
10485 (out+j)[3] = (uint)in[i+3];
10497 uint * restrict out = (uint* restrict)_out;
10501 (out+j)[0] = (uint)in[i+0];
10502 (out+j)[1] = (uint)in[i+1];
10503 (out+j)[2] = (uint)in[i+2];
10504 (out+j)[3] = (uint)in[i+3];
10516 uint * restrict out = (uint* restrict)_out;
10520 (out+j)[0] = (uint)in[i+0];
10521 (out+j)[1] = (uint)in[i+1];
10522 (out+j)[2] = (uint)in[i+2];
10523 (out+j)[3] = (uint)in[i+3];
10524 (out+j)[4] = (uint)in[i+4];
10525 (out+j)[5] = (uint)in[i+5];
10537 uint * restrict out = (uint* restrict)_out;
10543 (out+j)[0] = (uint)in[i+0];
10544 (out+j)[1] = (uint)in[i+1];
10545 (out+j)[2] = (uint)in[i+2];
10546 (out+j)[3] = (uint)in[i+3];
10547 (out+j)[4] = (uint)in[i+4];
10548 (out+j)[5] = (uint)in[i+5];
10551 (out+j)[0] = (uint)in[i+2];
10552 (out+j)[1] = (uint)in[i-2];
10553 (out+j)[2] = (uint)in[i+0];
10554 (out+j)[3] = (uint)in[i+3];
10555 (out+j)[4] = (uint)in[i+4];
10556 (out+j)[5] = (uint)in[i+6];
10569 uint * restrict out = (uint* restrict)_out;
10573 (out+j)[0] = (uint)in[i];
10585 uint * restrict out = (uint* restrict)_out;
10589 (out+j)[0] = (uint)in[i+1];
10590 (out+j)[1] = (uint)in[i];
10602 uint * restrict out = (uint* restrict)_out;
10606 (out+j)[0] = (uint)in[i+1];
10607 (out+j)[1] = (uint)in[i];
10619 uint * restrict out = (uint* restrict)_out;
10624 (out+j)[0] = (uint)in[i+1];
10625 (out+j)[1] = (uint)in[i];
10628 (out+j)[0] = (uint)in[start];
10629 (out+j)[1] = (uint)in[end];
10640 uint * restrict out = (uint* restrict)_out;
10644 (out+j)[0] = (uint)in[i+1];
10645 (out+j)[1] = (uint)in[i+2];
10646 (out+j)[2] = (uint)in[i];
10658 uint * restrict out = (uint* restrict)_out;
10662 (out+j)[0] = (uint)in[i+1+(i&1)];
10663 (out+j)[1] = (uint)in[i+2-(i&1)];
10664 (out+j)[2] = (uint)in[i];
10676 uint * restrict out = (uint* restrict)_out;
10680 (out+j)[0] = (uint)in[i+2];
10681 (out+j)[1] = (uint)in[start];
10682 (out+j)[2] = (uint)in[i+1];
10694 uint * restrict out = (uint* restrict)_out;
10698 (out+j+0)[0] = (uint)in[i+1];
10699 (out+j+0)[1] = (uint)in[i+2];
10700 (out+j+0)[2] = (uint)in[i+0];
10701 (out+j+3)[0] = (uint)in[i+2];
10702 (out+j+3)[1] = (uint)in[i+3];
10703 (out+j+3)[2] = (uint)in[i+0];
10715 uint * restrict out = (uint* restrict)_out;
10719 (out+j+0)[0] = (uint)in[i+1];
10720 (out+j+0)[1] = (uint)in[i+3];
10721 (out+j+0)[2] = (uint)in[i+0];
10722 (out+j+3)[0] = (uint)in[i+3];
10723 (out+j+3)[1] = (uint)in[i+2];
10724 (out+j+3)[2] = (uint)in[i+0];
10736 uint * restrict out = (uint* restrict)_out;
10740 (out+j)[0] = (uint)in[i+1];
10741 (out+j)[1] = (uint)in[i+2];
10742 (out+j)[2] = (uint)in[start];
10754 uint * restrict out = (uint* restrict)_out;
10758 (out+j)[0] = (uint)in[i+3];
10759 (out+j)[1] = (uint)in[i+2];
10760 (out+j)[2] = (uint)in[i+1];
10761 (out+j)[3] = (uint)in[i+0];
10773 uint * restrict out = (uint* restrict)_out;
10777 (out+j)[0] = (uint)in[i+3];
10778 (out+j)[1] = (uint)in[i+2];
10779 (out+j)[2] = (uint)in[i+1];
10780 (out+j)[3] = (uint)in[i+0];
10792 uint * restrict out = (uint* restrict)_out;
10796 (out+j)[0] = (uint)in[i+4];
10797 (out+j)[1] = (uint)in[i+5];
10798 (out+j)[2] = (uint)in[i+0];
10799 (out+j)[3] = (uint)in[i+1];
10800 (out+j)[4] = (uint)in[i+2];
10801 (out+j)[5] = (uint)in[i+3];
10813 uint * restrict out = (uint* restrict)_out;
10819 (out+j)[0] = (uint)in[i+4];
10820 (out+j)[1] = (uint)in[i+5];
10821 (out+j)[2] = (uint)in[i+0];
10822 (out+j)[3] = (uint)in[i+1];
10823 (out+j)[4] = (uint)in[i+2];
10824 (out+j)[5] = (uint)in[i+3];
10827 (out+j)[0] = (uint)in[i+4];
10828 (out+j)[1] = (uint)in[i+6];
10829 (out+j)[2] = (uint)in[i+2];
10830 (out+j)[3] = (uint)in[i-2];
10831 (out+j)[4] = (uint)in[i+0];
10832 (out+j)[5] = (uint)in[i+3];
10845 uint * restrict out = (uint* restrict)_out;
10849 (out+j)[0] = (uint)in[i];
10861 uint * restrict out = (uint* restrict)_out;
10865 (out+j)[0] = (uint)in[i+1];
10866 (out+j)[1] = (uint)in[i];
10878 uint * restrict out = (uint* restrict)_out;
10882 (out+j)[0] = (uint)in[i+1];
10883 (out+j)[1] = (uint)in[i];
10895 uint * restrict out = (uint* restrict)_out;
10908 (out+j)[0] = (uint)in[start];
10909 (out+j)[1] = (uint)in[end];
10917 (out+j)[0] = (uint)in[start];
10918 (out+j)[1] = (uint)in[end];
10924 (out+j)[0] = (uint)in[i+1];
10925 (out+j)[1] = (uint)in[i];
10928 (out+j)[0] = (uint)in[start];
10929 (out+j)[1] = (uint)in[end];
10940 uint * restrict out = (uint* restrict)_out;
10944 (out+j)[0] = (uint)in[i+1];
10945 (out+j)[1] = (uint)in[i+2];
10946 (out+j)[2] = (uint)in[i];
10958 uint * restrict out = (uint* restrict)_out;
10962 (out+j)[0] = (uint)in[i+1+(i&1)];
10963 (out+j)[1] = (uint)in[i+2-(i&1)];
10964 (out+j)[2] = (uint)in[i];
10976 uint * restrict out = (uint* restrict)_out;
11002 (out+j)[0] = (uint)in[i+2];
11003 (out+j)[1] = (uint)in[start];
11004 (out+j)[2] = (uint)in[i+1];
11016 uint * restrict out = (uint* restrict)_out;
11046 (out+j+0)[0] = (uint)in[i+1];
11047 (out+j+0)[1] = (uint)in[i+2];
11048 (out+j+0)[2] = (uint)in[i+0];
11049 (out+j+3)[0] = (uint)in[i+2];
11050 (out+j+3)[1] = (uint)in[i+3];
11051 (out+j+3)[2] = (uint)in[i+0];
11063 uint * restrict out = (uint* restrict)_out;
11093 (out+j+0)[0] = (uint)in[i+1];
11094 (out+j+0)[1] = (uint)in[i+3];
11095 (out+j+0)[2] = (uint)in[i+0];
11096 (out+j+3)[0] = (uint)in[i+3];
11097 (out+j+3)[1] = (uint)in[i+2];
11098 (out+j+3)[2] = (uint)in[i+0];
11110 uint * restrict out = (uint* restrict)_out;
11136 (out+j)[0] = (uint)in[i+1];
11137 (out+j)[1] = (uint)in[i+2];
11138 (out+j)[2] = (uint)in[start];
11150 uint * restrict out = (uint* restrict)_out;
11154 (out+j)[0] = (uint)in[i+3];
11155 (out+j)[1] = (uint)in[i+2];
11156 (out+j)[2] = (uint)in[i+1];
11157 (out+j)[3] = (uint)in[i+0];
11169 uint * restrict out = (uint* restrict)_out;
11173 (out+j)[0] = (uint)in[i+3];
11174 (out+j)[1] = (uint)in[i+2];
11175 (out+j)[2] = (uint)in[i+1];
11176 (out+j)[3] = (uint)in[i+0];
11188 uint * restrict out = (uint* restrict)_out;
11192 (out+j)[0] = (uint)in[i+4];
11193 (out+j)[1] = (uint)in[i+5];
11194 (out+j)[2] = (uint)in[i+0];
11195 (out+j)[3] = (uint)in[i+1];
11196 (out+j)[4] = (uint)in[i+2];
11197 (out+j)[5] = (uint)in[i+3];
11209 uint * restrict out = (uint* restrict)_out;
11215 (out+j)[0] = (uint)in[i+4];
11216 (out+j)[1] = (uint)in[i+5];
11217 (out+j)[2] = (uint)in[i+0];
11218 (out+j)[3] = (uint)in[i+1];
11219 (out+j)[4] = (uint)in[i+2];
11220 (out+j)[5] = (uint)in[i+3];
11223 (out+j)[0] = (uint)in[i+4];
11224 (out+j)[1] = (uint)in[i+6];
11225 (out+j)[2] = (uint)in[i+2];
11226 (out+j)[3] = (uint)in[i-2];
11227 (out+j)[4] = (uint)in[i+0];
11228 (out+j)[5] = (uint)in[i+3];
11241 uint * restrict out = (uint* restrict)_out;
11245 (out+j)[0] = (uint)in[i];
11257 uint * restrict out = (uint* restrict)_out;
11261 (out+j)[0] = (uint)in[i+1];
11262 (out+j)[1] = (uint)in[i];
11274 uint * restrict out = (uint* restrict)_out;
11278 (out+j)[0] = (uint)in[i+1];
11279 (out+j)[1] = (uint)in[i];
11291 uint * restrict out = (uint* restrict)_out;
11296 (out+j)[0] = (uint)in[i+1];
11297 (out+j)[1] = (uint)in[i];
11300 (out+j)[0] = (uint)in[start];
11301 (out+j)[1] = (uint)in[end];
11312 uint * restrict out = (uint* restrict)_out;
11316 (out+j)[0] = (uint)in[i+2];
11317 (out+j)[1] = (uint)in[i];
11318 (out+j)[2] = (uint)in[i+1];
11330 uint * restrict out = (uint* restrict)_out;
11334 (out+j)[0] = (uint)in[i+2];
11335 (out+j)[1] = (uint)in[i+(i&1)];
11336 (out+j)[2] = (uint)in[i+1-(i&1)];
11348 uint * restrict out = (uint* restrict)_out;
11352 (out+j)[0] = (uint)in[i+2];
11353 (out+j)[1] = (uint)in[start];
11354 (out+j)[2] = (uint)in[i+1];
11366 uint * restrict out = (uint* restrict)_out;
11370 (out+j+0)[0] = (uint)in[i+3];
11371 (out+j+0)[1] = (uint)in[i+0];
11372 (out+j+0)[2] = (uint)in[i+1];
11373 (out+j+3)[0] = (uint)in[i+3];
11374 (out+j+3)[1] = (uint)in[i+1];
11375 (out+j+3)[2] = (uint)in[i+2];
11387 uint * restrict out = (uint* restrict)_out;
11391 (out+j+0)[0] = (uint)in[i+3];
11392 (out+j+0)[1] = (uint)in[i+2];
11393 (out+j+0)[2] = (uint)in[i+0];
11394 (out+j+3)[0] = (uint)in[i+3];
11395 (out+j+3)[1] = (uint)in[i+0];
11396 (out+j+3)[2] = (uint)in[i+1];
11408 uint * restrict out = (uint* restrict)_out;
11412 (out+j)[0] = (uint)in[start];
11413 (out+j)[1] = (uint)in[i+1];
11414 (out+j)[2] = (uint)in[i+2];
11426 uint * restrict out = (uint* restrict)_out;
11430 (out+j)[0] = (uint)in[i+3];
11431 (out+j)[1] = (uint)in[i+2];
11432 (out+j)[2] = (uint)in[i+1];
11433 (out+j)[3] = (uint)in[i+0];
11445 uint * restrict out = (uint* restrict)_out;
11449 (out+j)[0] = (uint)in[i+3];
11450 (out+j)[1] = (uint)in[i+2];
11451 (out+j)[2] = (uint)in[i+1];
11452 (out+j)[3] = (uint)in[i+0];
11464 uint * restrict out = (uint* restrict)_out;
11468 (out+j)[0] = (uint)in[i+4];
11469 (out+j)[1] = (uint)in[i+5];
11470 (out+j)[2] = (uint)in[i+0];
11471 (out+j)[3] = (uint)in[i+1];
11472 (out+j)[4] = (uint)in[i+2];
11473 (out+j)[5] = (uint)in[i+3];
11485 uint * restrict out = (uint* restrict)_out;
11491 (out+j)[0] = (uint)in[i+4];
11492 (out+j)[1] = (uint)in[i+5];
11493 (out+j)[2] = (uint)in[i+0];
11494 (out+j)[3] = (uint)in[i+1];
11495 (out+j)[4] = (uint)in[i+2];
11496 (out+j)[5] = (uint)in[i+3];
11499 (out+j)[0] = (uint)in[i+4];
11500 (out+j)[1] = (uint)in[i+6];
11501 (out+j)[2] = (uint)in[i+2];
11502 (out+j)[3] = (uint)in[i-2];
11503 (out+j)[4] = (uint)in[i+0];
11504 (out+j)[5] = (uint)in[i+3];
11517 uint * restrict out = (uint* restrict)_out;
11521 (out+j)[0] = (uint)in[i];
11533 uint * restrict out = (uint* restrict)_out;
11537 (out+j)[0] = (uint)in[i+1];
11538 (out+j)[1] = (uint)in[i];
11550 uint * restrict out = (uint* restrict)_out;
11554 (out+j)[0] = (uint)in[i+1];
11555 (out+j)[1] = (uint)in[i];
11567 uint * restrict out = (uint* restrict)_out;
11580 (out+j)[0] = (uint)in[start];
11581 (out+j)[1] = (uint)in[end];
11589 (out+j)[0] = (uint)in[start];
11590 (out+j)[1] = (uint)in[end];
11596 (out+j)[0] = (uint)in[i+1];
11597 (out+j)[1] = (uint)in[i];
11600 (out+j)[0] = (uint)in[start];
11601 (out+j)[1] = (uint)in[end];
11612 uint * restrict out = (uint* restrict)_out;
11616 (out+j)[0] = (uint)in[i+2];
11617 (out+j)[1] = (uint)in[i];
11618 (out+j)[2] = (uint)in[i+1];
11630 uint * restrict out = (uint* restrict)_out;
11634 (out+j)[0] = (uint)in[i+2];
11635 (out+j)[1] = (uint)in[i+(i&1)];
11636 (out+j)[2] = (uint)in[i+1-(i&1)];
11648 uint * restrict out = (uint* restrict)_out;
11674 (out+j)[0] = (uint)in[i+2];
11675 (out+j)[1] = (uint)in[start];
11676 (out+j)[2] = (uint)in[i+1];
11688 uint * restrict out = (uint* restrict)_out;
11718 (out+j+0)[0] = (uint)in[i+3];
11719 (out+j+0)[1] = (uint)in[i+0];
11720 (out+j+0)[2] = (uint)in[i+1];
11721 (out+j+3)[0] = (uint)in[i+3];
11722 (out+j+3)[1] = (uint)in[i+1];
11723 (out+j+3)[2] = (uint)in[i+2];
11735 uint * restrict out = (uint* restrict)_out;
11765 (out+j+0)[0] = (uint)in[i+3];
11766 (out+j+0)[1] = (uint)in[i+2];
11767 (out+j+0)[2] = (uint)in[i+0];
11768 (out+j+3)[0] = (uint)in[i+3];
11769 (out+j+3)[1] = (uint)in[i+0];
11770 (out+j+3)[2] = (uint)in[i+1];
11782 uint * restrict out = (uint* restrict)_out;
11808 (out+j)[0] = (uint)in[start];
11809 (out+j)[1] = (uint)in[i+1];
11810 (out+j)[2] = (uint)in[i+2];
11822 uint * restrict out = (uint* restrict)_out;
11826 (out+j)[0] = (uint)in[i+3];
11827 (out+j)[1] = (uint)in[i+2];
11828 (out+j)[2] = (uint)in[i+1];
11829 (out+j)[3] = (uint)in[i+0];
11841 uint * restrict out = (uint* restrict)_out;
11845 (out+j)[0] = (uint)in[i+3];
11846 (out+j)[1] = (uint)in[i+2];
11847 (out+j)[2] = (uint)in[i+1];
11848 (out+j)[3] = (uint)in[i+0];
11860 uint * restrict out = (uint* restrict)_out;
11864 (out+j)[0] = (uint)in[i+4];
11865 (out+j)[1] = (uint)in[i+5];
11866 (out+j)[2] = (uint)in[i+0];
11867 (out+j)[3] = (uint)in[i+1];
11868 (out+j)[4] = (uint)in[i+2];
11869 (out+j)[5] = (uint)in[i+3];
11881 uint * restrict out = (uint* restrict)_out;
11887 (out+j)[0] = (uint)in[i+4];
11888 (out+j)[1] = (uint)in[i+5];
11889 (out+j)[2] = (uint)in[i+0];
11890 (out+j)[3] = (uint)in[i+1];
11891 (out+j)[4] = (uint)in[i+2];
11892 (out+j)[5] = (uint)in[i+3];
11895 (out+j)[0] = (uint)in[i+4];
11896 (out+j)[1] = (uint)in[i+6];
11897 (out+j)[2] = (uint)in[i+2];
11898 (out+j)[3] = (uint)in[i-2];
11899 (out+j)[4] = (uint)in[i+0];
11900 (out+j)[5] = (uint)in[i+3];
11913 uint * restrict out = (uint* restrict)_out;
11917 (out+j)[0] = (uint)in[i];
11929 uint * restrict out = (uint* restrict)_out;
11933 (out+j)[0] = (uint)in[i];
11934 (out+j)[1] = (uint)in[i+1];
11946 uint * restrict out = (uint* restrict)_out;
11950 (out+j)[0] = (uint)in[i];
11951 (out+j)[1] = (uint)in[i+1];
11963 uint * restrict out = (uint* restrict)_out;
11968 (out+j)[0] = (uint)in[i];
11969 (out+j)[1] = (uint)in[i+1];
11972 (out+j)[0] = (uint)in[end];
11973 (out+j)[1] = (uint)in[start];
11984 uint * restrict out = (uint* restrict)_out;
11988 (out+j)[0] = (uint)in[i];
11989 (out+j)[1] = (uint)in[i+1];
11990 (out+j)[2] = (uint)in[i+2];
12002 uint * restrict out = (uint* restrict)_out;
12006 (out+j)[0] = (uint)in[i+(i&1)];
12007 (out+j)[1] = (uint)in[i+1-(i&1)];
12008 (out+j)[2] = (uint)in[i+2];
12020 uint * restrict out = (uint* restrict)_out;
12024 (out+j)[0] = (uint)in[start];
12025 (out+j)[1] = (uint)in[i+1];
12026 (out+j)[2] = (uint)in[i+2];
12038 uint * restrict out = (uint* restrict)_out;
12042 (out+j+0)[0] = (uint)in[i+0];
12043 (out+j+0)[1] = (uint)in[i+1];
12044 (out+j+0)[2] = (uint)in[i+3];
12045 (out+j+3)[0] = (uint)in[i+1];
12046 (out+j+3)[1] = (uint)in[i+2];
12047 (out+j+3)[2] = (uint)in[i+3];
12059 uint * restrict out = (uint* restrict)_out;
12063 (out+j+0)[0] = (uint)in[i+2];
12064 (out+j+0)[1] = (uint)in[i+0];
12065 (out+j+0)[2] = (uint)in[i+3];
12066 (out+j+3)[0] = (uint)in[i+0];
12067 (out+j+3)[1] = (uint)in[i+1];
12068 (out+j+3)[2] = (uint)in[i+3];
12080 uint * restrict out = (uint* restrict)_out;
12084 (out+j)[0] = (uint)in[i+1];
12085 (out+j)[1] = (uint)in[i+2];
12086 (out+j)[2] = (uint)in[start];
12098 uint * restrict out = (uint* restrict)_out;
12102 (out+j)[0] = (uint)in[i+0];
12103 (out+j)[1] = (uint)in[i+1];
12104 (out+j)[2] = (uint)in[i+2];
12105 (out+j)[3] = (uint)in[i+3];
12117 uint * restrict out = (uint* restrict)_out;
12121 (out+j)[0] = (uint)in[i+0];
12122 (out+j)[1] = (uint)in[i+1];
12123 (out+j)[2] = (uint)in[i+2];
12124 (out+j)[3] = (uint)in[i+3];
12136 uint * restrict out = (uint* restrict)_out;
12140 (out+j)[0] = (uint)in[i+0];
12141 (out+j)[1] = (uint)in[i+1];
12142 (out+j)[2] = (uint)in[i+2];
12143 (out+j)[3] = (uint)in[i+3];
12144 (out+j)[4] = (uint)in[i+4];
12145 (out+j)[5] = (uint)in[i+5];
12157 uint * restrict out = (uint* restrict)_out;
12163 (out+j)[0] = (uint)in[i+0];
12164 (out+j)[1] = (uint)in[i+1];
12165 (out+j)[2] = (uint)in[i+2];
12166 (out+j)[3] = (uint)in[i+3];
12167 (out+j)[4] = (uint)in[i+4];
12168 (out+j)[5] = (uint)in[i+5];
12171 (out+j)[0] = (uint)in[i+2];
12172 (out+j)[1] = (uint)in[i-2];
12173 (out+j)[2] = (uint)in[i+0];
12174 (out+j)[3] = (uint)in[i+3];
12175 (out+j)[4] = (uint)in[i+4];
12176 (out+j)[5] = (uint)in[i+6];
12189 uint * restrict out = (uint* restrict)_out;
12193 (out+j)[0] = (uint)in[i];
12205 uint * restrict out = (uint* restrict)_out;
12209 (out+j)[0] = (uint)in[i];
12210 (out+j)[1] = (uint)in[i+1];
12222 uint * restrict out = (uint* restrict)_out;
12226 (out+j)[0] = (uint)in[i];
12227 (out+j)[1] = (uint)in[i+1];
12239 uint * restrict out = (uint* restrict)_out;
12252 (out+j)[0] = (uint)in[end];
12253 (out+j)[1] = (uint)in[start];
12261 (out+j)[0] = (uint)in[end];
12262 (out+j)[1] = (uint)in[start];
12268 (out+j)[0] = (uint)in[i];
12269 (out+j)[1] = (uint)in[i+1];
12272 (out+j)[0] = (uint)in[end];
12273 (out+j)[1] = (uint)in[start];
12284 uint * restrict out = (uint* restrict)_out;
12288 (out+j)[0] = (uint)in[i];
12289 (out+j)[1] = (uint)in[i+1];
12290 (out+j)[2] = (uint)in[i+2];
12302 uint * restrict out = (uint* restrict)_out;
12306 (out+j)[0] = (uint)in[i+(i&1)];
12307 (out+j)[1] = (uint)in[i+1-(i&1)];
12308 (out+j)[2] = (uint)in[i+2];
12320 uint * restrict out = (uint* restrict)_out;
12346 (out+j)[0] = (uint)in[start];
12347 (out+j)[1] = (uint)in[i+1];
12348 (out+j)[2] = (uint)in[i+2];
12360 uint * restrict out = (uint* restrict)_out;
12390 (out+j+0)[0] = (uint)in[i+0];
12391 (out+j+0)[1] = (uint)in[i+1];
12392 (out+j+0)[2] = (uint)in[i+3];
12393 (out+j+3)[0] = (uint)in[i+1];
12394 (out+j+3)[1] = (uint)in[i+2];
12395 (out+j+3)[2] = (uint)in[i+3];
12407 uint * restrict out = (uint* restrict)_out;
12437 (out+j+0)[0] = (uint)in[i+2];
12438 (out+j+0)[1] = (uint)in[i+0];
12439 (out+j+0)[2] = (uint)in[i+3];
12440 (out+j+3)[0] = (uint)in[i+0];
12441 (out+j+3)[1] = (uint)in[i+1];
12442 (out+j+3)[2] = (uint)in[i+3];
12454 uint * restrict out = (uint* restrict)_out;
12480 (out+j)[0] = (uint)in[i+1];
12481 (out+j)[1] = (uint)in[i+2];
12482 (out+j)[2] = (uint)in[start];
12494 uint * restrict out = (uint* restrict)_out;
12498 (out+j)[0] = (uint)in[i+0];
12499 (out+j)[1] = (uint)in[i+1];
12500 (out+j)[2] = (uint)in[i+2];
12501 (out+j)[3] = (uint)in[i+3];
12513 uint * restrict out = (uint* restrict)_out;
12517 (out+j)[0] = (uint)in[i+0];
12518 (out+j)[1] = (uint)in[i+1];
12519 (out+j)[2] = (uint)in[i+2];
12520 (out+j)[3] = (uint)in[i+3];
12532 uint * restrict out = (uint* restrict)_out;
12536 (out+j)[0] = (uint)in[i+0];
12537 (out+j)[1] = (uint)in[i+1];
12538 (out+j)[2] = (uint)in[i+2];
12539 (out+j)[3] = (uint)in[i+3];
12540 (out+j)[4] = (uint)in[i+4];
12541 (out+j)[5] = (uint)in[i+5];
12553 uint * restrict out = (uint* restrict)_out;
12559 (out+j)[0] = (uint)in[i+0];
12560 (out+j)[1] = (uint)in[i+1];
12561 (out+j)[2] = (uint)in[i+2];
12562 (out+j)[3] = (uint)in[i+3];
12563 (out+j)[4] = (uint)in[i+4];
12564 (out+j)[5] = (uint)in[i+5];
12567 (out+j)[0] = (uint)in[i+2];
12568 (out+j)[1] = (uint)in[i-2];
12569 (out+j)[2] = (uint)in[i+0];
12570 (out+j)[3] = (uint)in[i+3];
12571 (out+j)[4] = (uint)in[i+4];
12572 (out+j)[5] = (uint)in[i+6];
12584 const uint* restrict in = (const uint* restrict)_in;
12600 const uint* restrict in = (const uint* restrict)_in;
12617 const uint* restrict in = (const uint* restrict)_in;
12634 const uint* restrict in = (const uint* restrict)_in;
12655 const uint* restrict in = (const uint* restrict)_in;
12673 const uint* restrict in = (const uint* restrict)_in;
12691 const uint* restrict in = (const uint* restrict)_in;
12709 const uint* restrict in = (const uint* restrict)_in;
12730 const uint* restrict in = (const uint* restrict)_in;
12751 const uint* restrict in = (const uint* restrict)_in;
12769 const uint* restrict in = (const uint* restrict)_in;
12788 const uint* restrict in = (const uint* restrict)_in;
12807 const uint* restrict in = (const uint* restrict)_in;
12828 const uint* restrict in = (const uint* restrict)_in;
12860 const uint* restrict in = (const uint* restrict)_in;
12876 const uint* restrict in = (const uint* restrict)_in;
12893 const uint* restrict in = (const uint* restrict)_in;
12910 const uint* restrict in = (const uint* restrict)_in;
12955 const uint* restrict in = (const uint* restrict)_in;
12973 const uint* restrict in = (const uint* restrict)_in;
12991 const uint* restrict in = (const uint* restrict)_in;
13031 const uint* restrict in = (const uint* restrict)_in;
13078 const uint* restrict in = (const uint* restrict)_in;
13125 const uint* restrict in = (const uint* restrict)_in;
13165 const uint* restrict in = (const uint* restrict)_in;
13184 const uint* restrict in = (const uint* restrict)_in;
13203 const uint* restrict in = (const uint* restrict)_in;
13224 const uint* restrict in = (const uint* restrict)_in;
13256 const uint* restrict in = (const uint* restrict)_in;
13272 const uint* restrict in = (const uint* restrict)_in;
13289 const uint* restrict in = (const uint* restrict)_in;
13306 const uint* restrict in = (const uint* restrict)_in;
13327 const uint* restrict in = (const uint* restrict)_in;
13345 const uint* restrict in = (const uint* restrict)_in;
13363 const uint* restrict in = (const uint* restrict)_in;
13381 const uint* restrict in = (const uint* restrict)_in;
13402 const uint* restrict in = (const uint* restrict)_in;
13423 const uint* restrict in = (const uint* restrict)_in;
13441 const uint* restrict in = (const uint* restrict)_in;
13460 const uint* restrict in = (const uint* restrict)_in;
13479 const uint* restrict in = (const uint* restrict)_in;
13500 const uint* restrict in = (const uint* restrict)_in;
13532 const uint* restrict in = (const uint* restrict)_in;
13548 const uint* restrict in = (const uint* restrict)_in;
13565 const uint* restrict in = (const uint* restrict)_in;
13582 const uint* restrict in = (const uint* restrict)_in;
13627 const uint* restrict in = (const uint* restrict)_in;
13645 const uint* restrict in = (const uint* restrict)_in;
13663 const uint* restrict in = (const uint* restrict)_in;
13703 const uint* restrict in = (const uint* restrict)_in;
13750 const uint* restrict in = (const uint* restrict)_in;
13797 const uint* restrict in = (const uint* restrict)_in;
13837 const uint* restrict in = (const uint* restrict)_in;
13856 const uint* restrict in = (const uint* restrict)_in;
13875 const uint* restrict in = (const uint* restrict)_in;
13896 const uint* restrict in = (const uint* restrict)_in;
13928 const uint* restrict in = (const uint* restrict)_in;
13944 const uint* restrict in = (const uint* restrict)_in;
13961 const uint* restrict in = (const uint* restrict)_in;
13978 const uint* restrict in = (const uint* restrict)_in;
13999 const uint* restrict in = (const uint* restrict)_in;
14017 const uint* restrict in = (const uint* restrict)_in;
14035 const uint* restrict in = (const uint* restrict)_in;
14053 const uint* restrict in = (const uint* restrict)_in;
14074 const uint* restrict in = (const uint* restrict)_in;
14095 const uint* restrict in = (const uint* restrict)_in;
14113 const uint* restrict in = (const uint* restrict)_in;
14132 const uint* restrict in = (const uint* restrict)_in;
14151 const uint* restrict in = (const uint* restrict)_in;
14172 const uint* restrict in = (const uint* restrict)_in;
14204 const uint* restrict in = (const uint* restrict)_in;
14220 const uint* restrict in = (const uint* restrict)_in;
14237 const uint* restrict in = (const uint* restrict)_in;
14254 const uint* restrict in = (const uint* restrict)_in;
14299 const uint* restrict in = (const uint* restrict)_in;
14317 const uint* restrict in = (const uint* restrict)_in;
14335 const uint* restrict in = (const uint* restrict)_in;
14375 const uint* restrict in = (const uint* restrict)_in;
14422 const uint* restrict in = (const uint* restrict)_in;
14469 const uint* restrict in = (const uint* restrict)_in;
14509 const uint* restrict in = (const uint* restrict)_in;
14528 const uint* restrict in = (const uint* restrict)_in;
14547 const uint* restrict in = (const uint* restrict)_in;
14568 const uint* restrict in = (const uint* restrict)_in;
14600 const uint* restrict in = (const uint* restrict)_in;
14616 const uint* restrict in = (const uint* restrict)_in;
14633 const uint* restrict in = (const uint* restrict)_in;
14650 const uint* restrict in = (const uint* restrict)_in;
14671 const uint* restrict in = (const uint* restrict)_in;
14689 const uint* restrict in = (const uint* restrict)_in;
14707 const uint* restrict in = (const uint* restrict)_in;
14725 const uint* restrict in = (const uint* restrict)_in;
14746 const uint* restrict in = (const uint* restrict)_in;
14767 const uint* restrict in = (const uint* restrict)_in;
14785 const uint* restrict in = (const uint* restrict)_in;
14804 const uint* restrict in = (const uint* restrict)_in;
14823 const uint* restrict in = (const uint* restrict)_in;
14844 const uint* restrict in = (const uint* restrict)_in;
14876 const uint* restrict in = (const uint* restrict)_in;
14892 const uint* restrict in = (const uint* restrict)_in;
14909 const uint* restrict in = (const uint* restrict)_in;
14926 const uint* restrict in = (const uint* restrict)_in;
14971 const uint* restrict in = (const uint* restrict)_in;
14989 const uint* restrict in = (const uint* restrict)_in;
15007 const uint* restrict in = (const uint* restrict)_in;
15047 const uint* restrict in = (const uint* restrict)_in;
15094 const uint* restrict in = (const uint* restrict)_in;
15141 const uint* restrict in = (const uint* restrict)_in;
15181 const uint* restrict in = (const uint* restrict)_in;
15200 const uint* restrict in = (const uint* restrict)_in;
15219 const uint* restrict in = (const uint* restrict)_in;
15240 const uint* restrict in = (const uint* restrict)_in;
15272 const uint* restrict in = (const uint* restrict)_in;
15273 uint * restrict out = (uint* restrict)_out;
15277 (out+j)[0] = (uint)in[i];
15288 const uint* restrict in = (const uint* restrict)_in;
15289 uint * restrict out = (uint* restrict)_out;
15293 (out+j)[0] = (uint)in[i];
15294 (out+j)[1] = (uint)in[i+1];
15305 const uint* restrict in = (const uint* restrict)_in;
15306 uint * restrict out = (uint* restrict)_out;
15310 (out+j)[0] = (uint)in[i];
15311 (out+j)[1] = (uint)in[i+1];
15322 const uint* restrict in = (const uint* restrict)_in;
15323 uint * restrict out = (uint* restrict)_out;
15328 (out+j)[0] = (uint)in[i];
15329 (out+j)[1] = (uint)in[i+1];
15332 (out+j)[0] = (uint)in[end];
15333 (out+j)[1] = (uint)in[start];
15343 const uint* restrict in = (const uint* restrict)_in;
15344 uint * restrict out = (uint* restrict)_out;
15348 (out+j)[0] = (uint)in[i];
15349 (out+j)[1] = (uint)in[i+1];
15350 (out+j)[2] = (uint)in[i+2];
15361 const uint* restrict in = (const uint* restrict)_in;
15362 uint * restrict out = (uint* restrict)_out;
15366 (out+j)[0] = (uint)in[i];
15367 (out+j)[1] = (uint)in[i+1+(i&1)];
15368 (out+j)[2] = (uint)in[i+2-(i&1)];
15379 const uint* restrict in = (const uint* restrict)_in;
15380 uint * restrict out = (uint* restrict)_out;
15384 (out+j)[0] = (uint)in[i+1];
15385 (out+j)[1] = (uint)in[i+2];
15386 (out+j)[2] = (uint)in[start];
15397 const uint* restrict in = (const uint* restrict)_in;
15398 uint * restrict out = (uint* restrict)_out;
15402 (out+j+0)[0] = (uint)in[i+0];
15403 (out+j+0)[1] = (uint)in[i+1];
15404 (out+j+0)[2] = (uint)in[i+2];
15405 (out+j+3)[0] = (uint)in[i+0];
15406 (out+j+3)[1] = (uint)in[i+2];
15407 (out+j+3)[2] = (uint)in[i+3];
15418 const uint* restrict in = (const uint* restrict)_in;
15419 uint * restrict out = (uint* restrict)_out;
15423 (out+j+0)[0] = (uint)in[i+0];
15424 (out+j+0)[1] = (uint)in[i+1];
15425 (out+j+0)[2] = (uint)in[i+3];
15426 (out+j+3)[0] = (uint)in[i+0];
15427 (out+j+3)[1] = (uint)in[i+3];
15428 (out+j+3)[2] = (uint)in[i+2];
15439 const uint* restrict in = (const uint* restrict)_in;
15440 uint * restrict out = (uint* restrict)_out;
15444 (out+j)[0] = (uint)in[start];
15445 (out+j)[1] = (uint)in[i+1];
15446 (out+j)[2] = (uint)in[i+2];
15457 const uint* restrict in = (const uint* restrict)_in;
15458 uint * restrict out = (uint* restrict)_out;
15462 (out+j)[0] = (uint)in[i+0];
15463 (out+j)[1] = (uint)in[i+1];
15464 (out+j)[2] = (uint)in[i+2];
15465 (out+j)[3] = (uint)in[i+3];
15476 const uint* restrict in = (const uint* restrict)_in;
15477 uint * restrict out = (uint* restrict)_out;
15481 (out+j)[0] = (uint)in[i+0];
15482 (out+j)[1] = (uint)in[i+1];
15483 (out+j)[2] = (uint)in[i+2];
15484 (out+j)[3] = (uint)in[i+3];
15495 const uint* restrict in = (const uint* restrict)_in;
15496 uint * restrict out = (uint* restrict)_out;
15500 (out+j)[0] = (uint)in[i+0];
15501 (out+j)[1] = (uint)in[i+1];
15502 (out+j)[2] = (uint)in[i+2];
15503 (out+j)[3] = (uint)in[i+3];
15504 (out+j)[4] = (uint)in[i+4];
15505 (out+j)[5] = (uint)in[i+5];
15516 const uint* restrict in = (const uint* restrict)_in;
15517 uint * restrict out = (uint* restrict)_out;
15523 (out+j)[0] = (uint)in[i+0];
15524 (out+j)[1] = (uint)in[i+1];
15525 (out+j)[2] = (uint)in[i+2];
15526 (out+j)[3] = (uint)in[i+3];
15527 (out+j)[4] = (uint)in[i+4];
15528 (out+j)[5] = (uint)in[i+5];
15531 (out+j)[0] = (uint)in[i+2];
15532 (out+j)[1] = (uint)in[i-2];
15533 (out+j)[2] = (uint)in[i+0];
15534 (out+j)[3] = (uint)in[i+3];
15535 (out+j)[4] = (uint)in[i+4];
15536 (out+j)[5] = (uint)in[i+6];
15548 const uint* restrict in = (const uint* restrict)_in;
15549 uint * restrict out = (uint* restrict)_out;
15553 (out+j)[0] = (uint)in[i];
15564 const uint* restrict in = (const uint* restrict)_in;
15565 uint * restrict out = (uint* restrict)_out;
15569 (out+j)[0] = (uint)in[i];
15570 (out+j)[1] = (uint)in[i+1];
15581 const uint* restrict in = (const uint* restrict)_in;
15582 uint * restrict out = (uint* restrict)_out;
15586 (out+j)[0] = (uint)in[i];
15587 (out+j)[1] = (uint)in[i+1];
15598 const uint* restrict in = (const uint* restrict)_in;
15599 uint * restrict out = (uint* restrict)_out;
15612 (out+j)[0] = (uint)in[end];
15613 (out+j)[1] = (uint)in[start];
15621 (out+j)[0] = (uint)in[end];
15622 (out+j)[1] = (uint)in[start];
15628 (out+j)[0] = (uint)in[i];
15629 (out+j)[1] = (uint)in[i+1];
15632 (out+j)[0] = (uint)in[end];
15633 (out+j)[1] = (uint)in[start];
15643 const uint* restrict in = (const uint* restrict)_in;
15644 uint * restrict out = (uint* restrict)_out;
15648 (out+j)[0] = (uint)in[i];
15649 (out+j)[1] = (uint)in[i+1];
15650 (out+j)[2] = (uint)in[i+2];
15661 const uint* restrict in = (const uint* restrict)_in;
15662 uint * restrict out = (uint* restrict)_out;
15666 (out+j)[0] = (uint)in[i];
15667 (out+j)[1] = (uint)in[i+1+(i&1)];
15668 (out+j)[2] = (uint)in[i+2-(i&1)];
15679 const uint* restrict in = (const uint* restrict)_in;
15680 uint * restrict out = (uint* restrict)_out;
15706 (out+j)[0] = (uint)in[i+1];
15707 (out+j)[1] = (uint)in[i+2];
15708 (out+j)[2] = (uint)in[start];
15719 const uint* restrict in = (const uint* restrict)_in;
15720 uint * restrict out = (uint* restrict)_out;
15750 (out+j+0)[0] = (uint)in[i+0];
15751 (out+j+0)[1] = (uint)in[i+1];
15752 (out+j+0)[2] = (uint)in[i+2];
15753 (out+j+3)[0] = (uint)in[i+0];
15754 (out+j+3)[1] = (uint)in[i+2];
15755 (out+j+3)[2] = (uint)in[i+3];
15766 const uint* restrict in = (const uint* restrict)_in;
15767 uint * restrict out = (uint* restrict)_out;
15797 (out+j+0)[0] = (uint)in[i+0];
15798 (out+j+0)[1] = (uint)in[i+1];
15799 (out+j+0)[2] = (uint)in[i+3];
15800 (out+j+3)[0] = (uint)in[i+0];
15801 (out+j+3)[1] = (uint)in[i+3];
15802 (out+j+3)[2] = (uint)in[i+2];
15813 const uint* restrict in = (const uint* restrict)_in;
15814 uint * restrict out = (uint* restrict)_out;
15840 (out+j)[0] = (uint)in[start];
15841 (out+j)[1] = (uint)in[i+1];
15842 (out+j)[2] = (uint)in[i+2];
15853 const uint* restrict in = (const uint* restrict)_in;
15854 uint * restrict out = (uint* restrict)_out;
15858 (out+j)[0] = (uint)in[i+0];
15859 (out+j)[1] = (uint)in[i+1];
15860 (out+j)[2] = (uint)in[i+2];
15861 (out+j)[3] = (uint)in[i+3];
15872 const uint* restrict in = (const uint* restrict)_in;
15873 uint * restrict out = (uint* restrict)_out;
15877 (out+j)[0] = (uint)in[i+0];
15878 (out+j)[1] = (uint)in[i+1];
15879 (out+j)[2] = (uint)in[i+2];
15880 (out+j)[3] = (uint)in[i+3];
15891 const uint* restrict in = (const uint* restrict)_in;
15892 uint * restrict out = (uint* restrict)_out;
15896 (out+j)[0] = (uint)in[i+0];
15897 (out+j)[1] = (uint)in[i+1];
15898 (out+j)[2] = (uint)in[i+2];
15899 (out+j)[3] = (uint)in[i+3];
15900 (out+j)[4] = (uint)in[i+4];
15901 (out+j)[5] = (uint)in[i+5];
15912 const uint* restrict in = (const uint* restrict)_in;
15913 uint * restrict out = (uint* restrict)_out;
15919 (out+j)[0] = (uint)in[i+0];
15920 (out+j)[1] = (uint)in[i+1];
15921 (out+j)[2] = (uint)in[i+2];
15922 (out+j)[3] = (uint)in[i+3];
15923 (out+j)[4] = (uint)in[i+4];
15924 (out+j)[5] = (uint)in[i+5];
15927 (out+j)[0] = (uint)in[i+2];
15928 (out+j)[1] = (uint)in[i-2];
15929 (out+j)[2] = (uint)in[i+0];
15930 (out+j)[3] = (uint)in[i+3];
15931 (out+j)[4] = (uint)in[i+4];
15932 (out+j)[5] = (uint)in[i+6];
15944 const uint* restrict in = (const uint* restrict)_in;
15945 uint * restrict out = (uint* restrict)_out;
15949 (out+j)[0] = (uint)in[i];
15960 const uint* restrict in = (const uint* restrict)_in;
15961 uint * restrict out = (uint* restrict)_out;
15965 (out+j)[0] = (uint)in[i+1];
15966 (out+j)[1] = (uint)in[i];
15977 const uint* restrict in = (const uint* restrict)_in;
15978 uint * restrict out = (uint* restrict)_out;
15982 (out+j)[0] = (uint)in[i+1];
15983 (out+j)[1] = (uint)in[i];
15994 const uint* restrict in = (const uint* restrict)_in;
15995 uint * restrict out = (uint* restrict)_out;
16000 (out+j)[0] = (uint)in[i+1];
16001 (out+j)[1] = (uint)in[i];
16004 (out+j)[0] = (uint)in[start];
16005 (out+j)[1] = (uint)in[end];
16015 const uint* restrict in = (const uint* restrict)_in;
16016 uint * restrict out = (uint* restrict)_out;
16020 (out+j)[0] = (uint)in[i+1];
16021 (out+j)[1] = (uint)in[i+2];
16022 (out+j)[2] = (uint)in[i];
16033 const uint* restrict in = (const uint* restrict)_in;
16034 uint * restrict out = (uint* restrict)_out;
16038 (out+j)[0] = (uint)in[i+1+(i&1)];
16039 (out+j)[1] = (uint)in[i+2-(i&1)];
16040 (out+j)[2] = (uint)in[i];
16051 const uint* restrict in = (const uint* restrict)_in;
16052 uint * restrict out = (uint* restrict)_out;
16056 (out+j)[0] = (uint)in[i+2];
16057 (out+j)[1] = (uint)in[start];
16058 (out+j)[2] = (uint)in[i+1];
16069 const uint* restrict in = (const uint* restrict)_in;
16070 uint * restrict out = (uint* restrict)_out;
16074 (out+j+0)[0] = (uint)in[i+1];
16075 (out+j+0)[1] = (uint)in[i+2];
16076 (out+j+0)[2] = (uint)in[i+0];
16077 (out+j+3)[0] = (uint)in[i+2];
16078 (out+j+3)[1] = (uint)in[i+3];
16079 (out+j+3)[2] = (uint)in[i+0];
16090 const uint* restrict in = (const uint* restrict)_in;
16091 uint * restrict out = (uint* restrict)_out;
16095 (out+j+0)[0] = (uint)in[i+1];
16096 (out+j+0)[1] = (uint)in[i+3];
16097 (out+j+0)[2] = (uint)in[i+0];
16098 (out+j+3)[0] = (uint)in[i+3];
16099 (out+j+3)[1] = (uint)in[i+2];
16100 (out+j+3)[2] = (uint)in[i+0];
16111 const uint* restrict in = (const uint* restrict)_in;
16112 uint * restrict out = (uint* restrict)_out;
16116 (out+j)[0] = (uint)in[i+1];
16117 (out+j)[1] = (uint)in[i+2];
16118 (out+j)[2] = (uint)in[start];
16129 const uint* restrict in = (const uint* restrict)_in;
16130 uint * restrict out = (uint* restrict)_out;
16134 (out+j)[0] = (uint)in[i+3];
16135 (out+j)[1] = (uint)in[i+2];
16136 (out+j)[2] = (uint)in[i+1];
16137 (out+j)[3] = (uint)in[i+0];
16148 const uint* restrict in = (const uint* restrict)_in;
16149 uint * restrict out = (uint* restrict)_out;
16153 (out+j)[0] = (uint)in[i+3];
16154 (out+j)[1] = (uint)in[i+2];
16155 (out+j)[2] = (uint)in[i+1];
16156 (out+j)[3] = (uint)in[i+0];
16167 const uint* restrict in = (const uint* restrict)_in;
16168 uint * restrict out = (uint* restrict)_out;
16172 (out+j)[0] = (uint)in[i+4];
16173 (out+j)[1] = (uint)in[i+5];
16174 (out+j)[2] = (uint)in[i+0];
16175 (out+j)[3] = (uint)in[i+1];
16176 (out+j)[4] = (uint)in[i+2];
16177 (out+j)[5] = (uint)in[i+3];
16188 const uint* restrict in = (const uint* restrict)_in;
16189 uint * restrict out = (uint* restrict)_out;
16195 (out+j)[0] = (uint)in[i+4];
16196 (out+j)[1] = (uint)in[i+5];
16197 (out+j)[2] = (uint)in[i+0];
16198 (out+j)[3] = (uint)in[i+1];
16199 (out+j)[4] = (uint)in[i+2];
16200 (out+j)[5] = (uint)in[i+3];
16203 (out+j)[0] = (uint)in[i+4];
16204 (out+j)[1] = (uint)in[i+6];
16205 (out+j)[2] = (uint)in[i+2];
16206 (out+j)[3] = (uint)in[i-2];
16207 (out+j)[4] = (uint)in[i+0];
16208 (out+j)[5] = (uint)in[i+3];
16220 const uint* restrict in = (const uint* restrict)_in;
16221 uint * restrict out = (uint* restrict)_out;
16225 (out+j)[0] = (uint)in[i];
16236 const uint* restrict in = (const uint* restrict)_in;
16237 uint * restrict out = (uint* restrict)_out;
16241 (out+j)[0] = (uint)in[i+1];
16242 (out+j)[1] = (uint)in[i];
16253 const uint* restrict in = (const uint* restrict)_in;
16254 uint * restrict out = (uint* restrict)_out;
16258 (out+j)[0] = (uint)in[i+1];
16259 (out+j)[1] = (uint)in[i];
16270 const uint* restrict in = (const uint* restrict)_in;
16271 uint * restrict out = (uint* restrict)_out;
16284 (out+j)[0] = (uint)in[start];
16285 (out+j)[1] = (uint)in[end];
16293 (out+j)[0] = (uint)in[start];
16294 (out+j)[1] = (uint)in[end];
16300 (out+j)[0] = (uint)in[i+1];
16301 (out+j)[1] = (uint)in[i];
16304 (out+j)[0] = (uint)in[start];
16305 (out+j)[1] = (uint)in[end];
16315 const uint* restrict in = (const uint* restrict)_in;
16316 uint * restrict out = (uint* restrict)_out;
16320 (out+j)[0] = (uint)in[i+1];
16321 (out+j)[1] = (uint)in[i+2];
16322 (out+j)[2] = (uint)in[i];
16333 const uint* restrict in = (const uint* restrict)_in;
16334 uint * restrict out = (uint* restrict)_out;
16338 (out+j)[0] = (uint)in[i+1+(i&1)];
16339 (out+j)[1] = (uint)in[i+2-(i&1)];
16340 (out+j)[2] = (uint)in[i];
16351 const uint* restrict in = (const uint* restrict)_in;
16352 uint * restrict out = (uint* restrict)_out;
16378 (out+j)[0] = (uint)in[i+2];
16379 (out+j)[1] = (uint)in[start];
16380 (out+j)[2] = (uint)in[i+1];
16391 const uint* restrict in = (const uint* restrict)_in;
16392 uint * restrict out = (uint* restrict)_out;
16422 (out+j+0)[0] = (uint)in[i+1];
16423 (out+j+0)[1] = (uint)in[i+2];
16424 (out+j+0)[2] = (uint)in[i+0];
16425 (out+j+3)[0] = (uint)in[i+2];
16426 (out+j+3)[1] = (uint)in[i+3];
16427 (out+j+3)[2] = (uint)in[i+0];
16438 const uint* restrict in = (const uint* restrict)_in;
16439 uint * restrict out = (uint* restrict)_out;
16469 (out+j+0)[0] = (uint)in[i+1];
16470 (out+j+0)[1] = (uint)in[i+3];
16471 (out+j+0)[2] = (uint)in[i+0];
16472 (out+j+3)[0] = (uint)in[i+3];
16473 (out+j+3)[1] = (uint)in[i+2];
16474 (out+j+3)[2] = (uint)in[i+0];
16485 const uint* restrict in = (const uint* restrict)_in;
16486 uint * restrict out = (uint* restrict)_out;
16512 (out+j)[0] = (uint)in[i+1];
16513 (out+j)[1] = (uint)in[i+2];
16514 (out+j)[2] = (uint)in[start];
16525 const uint* restrict in = (const uint* restrict)_in;
16526 uint * restrict out = (uint* restrict)_out;
16530 (out+j)[0] = (uint)in[i+3];
16531 (out+j)[1] = (uint)in[i+2];
16532 (out+j)[2] = (uint)in[i+1];
16533 (out+j)[3] = (uint)in[i+0];
16544 const uint* restrict in = (const uint* restrict)_in;
16545 uint * restrict out = (uint* restrict)_out;
16549 (out+j)[0] = (uint)in[i+3];
16550 (out+j)[1] = (uint)in[i+2];
16551 (out+j)[2] = (uint)in[i+1];
16552 (out+j)[3] = (uint)in[i+0];
16563 const uint* restrict in = (const uint* restrict)_in;
16564 uint * restrict out = (uint* restrict)_out;
16568 (out+j)[0] = (uint)in[i+4];
16569 (out+j)[1] = (uint)in[i+5];
16570 (out+j)[2] = (uint)in[i+0];
16571 (out+j)[3] = (uint)in[i+1];
16572 (out+j)[4] = (uint)in[i+2];
16573 (out+j)[5] = (uint)in[i+3];
16584 const uint* restrict in = (const uint* restrict)_in;
16585 uint * restrict out = (uint* restrict)_out;
16591 (out+j)[0] = (uint)in[i+4];
16592 (out+j)[1] = (uint)in[i+5];
16593 (out+j)[2] = (uint)in[i+0];
16594 (out+j)[3] = (uint)in[i+1];
16595 (out+j)[4] = (uint)in[i+2];
16596 (out+j)[5] = (uint)in[i+3];
16599 (out+j)[0] = (uint)in[i+4];
16600 (out+j)[1] = (uint)in[i+6];
16601 (out+j)[2] = (uint)in[i+2];
16602 (out+j)[3] = (uint)in[i-2];
16603 (out+j)[4] = (uint)in[i+0];
16604 (out+j)[5] = (uint)in[i+3];
16616 const uint* restrict in = (const uint* restrict)_in;
16617 uint * restrict out = (uint* restrict)_out;
16621 (out+j)[0] = (uint)in[i];
16632 const uint* restrict in = (const uint* restrict)_in;
16633 uint * restrict out = (uint* restrict)_out;
16637 (out+j)[0] = (uint)in[i+1];
16638 (out+j)[1] = (uint)in[i];
16649 const uint* restrict in = (const uint* restrict)_in;
16650 uint * restrict out = (uint* restrict)_out;
16654 (out+j)[0] = (uint)in[i+1];
16655 (out+j)[1] = (uint)in[i];
16666 const uint* restrict in = (const uint* restrict)_in;
16667 uint * restrict out = (uint* restrict)_out;
16672 (out+j)[0] = (uint)in[i+1];
16673 (out+j)[1] = (uint)in[i];
16676 (out+j)[0] = (uint)in[start];
16677 (out+j)[1] = (uint)in[end];
16687 const uint* restrict in = (const uint* restrict)_in;
16688 uint * restrict out = (uint* restrict)_out;
16692 (out+j)[0] = (uint)in[i+2];
16693 (out+j)[1] = (uint)in[i];
16694 (out+j)[2] = (uint)in[i+1];
16705 const uint* restrict in = (const uint* restrict)_in;
16706 uint * restrict out = (uint* restrict)_out;
16710 (out+j)[0] = (uint)in[i+2];
16711 (out+j)[1] = (uint)in[i+(i&1)];
16712 (out+j)[2] = (uint)in[i+1-(i&1)];
16723 const uint* restrict in = (const uint* restrict)_in;
16724 uint * restrict out = (uint* restrict)_out;
16728 (out+j)[0] = (uint)in[i+2];
16729 (out+j)[1] = (uint)in[start];
16730 (out+j)[2] = (uint)in[i+1];
16741 const uint* restrict in = (const uint* restrict)_in;
16742 uint * restrict out = (uint* restrict)_out;
16746 (out+j+0)[0] = (uint)in[i+3];
16747 (out+j+0)[1] = (uint)in[i+0];
16748 (out+j+0)[2] = (uint)in[i+1];
16749 (out+j+3)[0] = (uint)in[i+3];
16750 (out+j+3)[1] = (uint)in[i+1];
16751 (out+j+3)[2] = (uint)in[i+2];
16762 const uint* restrict in = (const uint* restrict)_in;
16763 uint * restrict out = (uint* restrict)_out;
16767 (out+j+0)[0] = (uint)in[i+3];
16768 (out+j+0)[1] = (uint)in[i+2];
16769 (out+j+0)[2] = (uint)in[i+0];
16770 (out+j+3)[0] = (uint)in[i+3];
16771 (out+j+3)[1] = (uint)in[i+0];
16772 (out+j+3)[2] = (uint)in[i+1];
16783 const uint* restrict in = (const uint* restrict)_in;
16784 uint * restrict out = (uint* restrict)_out;
16788 (out+j)[0] = (uint)in[start];
16789 (out+j)[1] = (uint)in[i+1];
16790 (out+j)[2] = (uint)in[i+2];
16801 const uint* restrict in = (const uint* restrict)_in;
16802 uint * restrict out = (uint* restrict)_out;
16806 (out+j)[0] = (uint)in[i+3];
16807 (out+j)[1] = (uint)in[i+2];
16808 (out+j)[2] = (uint)in[i+1];
16809 (out+j)[3] = (uint)in[i+0];
16820 const uint* restrict in = (const uint* restrict)_in;
16821 uint * restrict out = (uint* restrict)_out;
16825 (out+j)[0] = (uint)in[i+3];
16826 (out+j)[1] = (uint)in[i+2];
16827 (out+j)[2] = (uint)in[i+1];
16828 (out+j)[3] = (uint)in[i+0];
16839 const uint* restrict in = (const uint* restrict)_in;
16840 uint * restrict out = (uint* restrict)_out;
16844 (out+j)[0] = (uint)in[i+4];
16845 (out+j)[1] = (uint)in[i+5];
16846 (out+j)[2] = (uint)in[i+0];
16847 (out+j)[3] = (uint)in[i+1];
16848 (out+j)[4] = (uint)in[i+2];
16849 (out+j)[5] = (uint)in[i+3];
16860 const uint* restrict in = (const uint* restrict)_in;
16861 uint * restrict out = (uint* restrict)_out;
16867 (out+j)[0] = (uint)in[i+4];
16868 (out+j)[1] = (uint)in[i+5];
16869 (out+j)[2] = (uint)in[i+0];
16870 (out+j)[3] = (uint)in[i+1];
16871 (out+j)[4] = (uint)in[i+2];
16872 (out+j)[5] = (uint)in[i+3];
16875 (out+j)[0] = (uint)in[i+4];
16876 (out+j)[1] = (uint)in[i+6];
16877 (out+j)[2] = (uint)in[i+2];
16878 (out+j)[3] = (uint)in[i-2];
16879 (out+j)[4] = (uint)in[i+0];
16880 (out+j)[5] = (uint)in[i+3];
16892 const uint* restrict in = (const uint* restrict)_in;
16893 uint * restrict out = (uint* restrict)_out;
16897 (out+j)[0] = (uint)in[i];
16908 const uint* restrict in = (const uint* restrict)_in;
16909 uint * restrict out = (uint* restrict)_out;
16913 (out+j)[0] = (uint)in[i+1];
16914 (out+j)[1] = (uint)in[i];
16925 const uint* restrict in = (const uint* restrict)_in;
16926 uint * restrict out = (uint* restrict)_out;
16930 (out+j)[0] = (uint)in[i+1];
16931 (out+j)[1] = (uint)in[i];
16942 const uint* restrict in = (const uint* restrict)_in;
16943 uint * restrict out = (uint* restrict)_out;
16956 (out+j)[0] = (uint)in[start];
16957 (out+j)[1] = (uint)in[end];
16965 (out+j)[0] = (uint)in[start];
16966 (out+j)[1] = (uint)in[end];
16972 (out+j)[0] = (uint)in[i+1];
16973 (out+j)[1] = (uint)in[i];
16976 (out+j)[0] = (uint)in[start];
16977 (out+j)[1] = (uint)in[end];
16987 const uint* restrict in = (const uint* restrict)_in;
16988 uint * restrict out = (uint* restrict)_out;
16992 (out+j)[0] = (uint)in[i+2];
16993 (out+j)[1] = (uint)in[i];
16994 (out+j)[2] = (uint)in[i+1];
17005 const uint* restrict in = (const uint* restrict)_in;
17006 uint * restrict out = (uint* restrict)_out;
17010 (out+j)[0] = (uint)in[i+2];
17011 (out+j)[1] = (uint)in[i+(i&1)];
17012 (out+j)[2] = (uint)in[i+1-(i&1)];
17023 const uint* restrict in = (const uint* restrict)_in;
17024 uint * restrict out = (uint* restrict)_out;
17050 (out+j)[0] = (uint)in[i+2];
17051 (out+j)[1] = (uint)in[start];
17052 (out+j)[2] = (uint)in[i+1];
17063 const uint* restrict in = (const uint* restrict)_in;
17064 uint * restrict out = (uint* restrict)_out;
17094 (out+j+0)[0] = (uint)in[i+3];
17095 (out+j+0)[1] = (uint)in[i+0];
17096 (out+j+0)[2] = (uint)in[i+1];
17097 (out+j+3)[0] = (uint)in[i+3];
17098 (out+j+3)[1] = (uint)in[i+1];
17099 (out+j+3)[2] = (uint)in[i+2];
17110 const uint* restrict in = (const uint* restrict)_in;
17111 uint * restrict out = (uint* restrict)_out;
17141 (out+j+0)[0] = (uint)in[i+3];
17142 (out+j+0)[1] = (uint)in[i+2];
17143 (out+j+0)[2] = (uint)in[i+0];
17144 (out+j+3)[0] = (uint)in[i+3];
17145 (out+j+3)[1] = (uint)in[i+0];
17146 (out+j+3)[2] = (uint)in[i+1];
17157 const uint* restrict in = (const uint* restrict)_in;
17158 uint * restrict out = (uint* restrict)_out;
17184 (out+j)[0] = (uint)in[start];
17185 (out+j)[1] = (uint)in[i+1];
17186 (out+j)[2] = (uint)in[i+2];
17197 const uint* restrict in = (const uint* restrict)_in;
17198 uint * restrict out = (uint* restrict)_out;
17202 (out+j)[0] = (uint)in[i+3];
17203 (out+j)[1] = (uint)in[i+2];
17204 (out+j)[2] = (uint)in[i+1];
17205 (out+j)[3] = (uint)in[i+0];
17216 const uint* restrict in = (const uint* restrict)_in;
17217 uint * restrict out = (uint* restrict)_out;
17221 (out+j)[0] = (uint)in[i+3];
17222 (out+j)[1] = (uint)in[i+2];
17223 (out+j)[2] = (uint)in[i+1];
17224 (out+j)[3] = (uint)in[i+0];
17235 const uint* restrict in = (const uint* restrict)_in;
17236 uint * restrict out = (uint* restrict)_out;
17240 (out+j)[0] = (uint)in[i+4];
17241 (out+j)[1] = (uint)in[i+5];
17242 (out+j)[2] = (uint)in[i+0];
17243 (out+j)[3] = (uint)in[i+1];
17244 (out+j)[4] = (uint)in[i+2];
17245 (out+j)[5] = (uint)in[i+3];
17256 const uint* restrict in = (const uint* restrict)_in;
17257 uint * restrict out = (uint* restrict)_out;
17263 (out+j)[0] = (uint)in[i+4];
17264 (out+j)[1] = (uint)in[i+5];
17265 (out+j)[2] = (uint)in[i+0];
17266 (out+j)[3] = (uint)in[i+1];
17267 (out+j)[4] = (uint)in[i+2];
17268 (out+j)[5] = (uint)in[i+3];
17271 (out+j)[0] = (uint)in[i+4];
17272 (out+j)[1] = (uint)in[i+6];
17273 (out+j)[2] = (uint)in[i+2];
17274 (out+j)[3] = (uint)in[i-2];
17275 (out+j)[4] = (uint)in[i+0];
17276 (out+j)[5] = (uint)in[i+3];
17288 const uint* restrict in = (const uint* restrict)_in;
17289 uint * restrict out = (uint* restrict)_out;
17293 (out+j)[0] = (uint)in[i];
17304 const uint* restrict in = (const uint* restrict)_in;
17305 uint * restrict out = (uint* restrict)_out;
17309 (out+j)[0] = (uint)in[i];
17310 (out+j)[1] = (uint)in[i+1];
17321 const uint* restrict in = (const uint* restrict)_in;
17322 uint * restrict out = (uint* restrict)_out;
17326 (out+j)[0] = (uint)in[i];
17327 (out+j)[1] = (uint)in[i+1];
17338 const uint* restrict in = (const uint* restrict)_in;
17339 uint * restrict out = (uint* restrict)_out;
17344 (out+j)[0] = (uint)in[i];
17345 (out+j)[1] = (uint)in[i+1];
17348 (out+j)[0] = (uint)in[end];
17349 (out+j)[1] = (uint)in[start];
17359 const uint* restrict in = (const uint* restrict)_in;
17360 uint * restrict out = (uint* restrict)_out;
17364 (out+j)[0] = (uint)in[i];
17365 (out+j)[1] = (uint)in[i+1];
17366 (out+j)[2] = (uint)in[i+2];
17377 const uint* restrict in = (const uint* restrict)_in;
17378 uint * restrict out = (uint* restrict)_out;
17382 (out+j)[0] = (uint)in[i+(i&1)];
17383 (out+j)[1] = (uint)in[i+1-(i&1)];
17384 (out+j)[2] = (uint)in[i+2];
17395 const uint* restrict in = (const uint* restrict)_in;
17396 uint * restrict out = (uint* restrict)_out;
17400 (out+j)[0] = (uint)in[start];
17401 (out+j)[1] = (uint)in[i+1];
17402 (out+j)[2] = (uint)in[i+2];
17413 const uint* restrict in = (const uint* restrict)_in;
17414 uint * restrict out = (uint* restrict)_out;
17418 (out+j+0)[0] = (uint)in[i+0];
17419 (out+j+0)[1] = (uint)in[i+1];
17420 (out+j+0)[2] = (uint)in[i+3];
17421 (out+j+3)[0] = (uint)in[i+1];
17422 (out+j+3)[1] = (uint)in[i+2];
17423 (out+j+3)[2] = (uint)in[i+3];
17434 const uint* restrict in = (const uint* restrict)_in;
17435 uint * restrict out = (uint* restrict)_out;
17439 (out+j+0)[0] = (uint)in[i+2];
17440 (out+j+0)[1] = (uint)in[i+0];
17441 (out+j+0)[2] = (uint)in[i+3];
17442 (out+j+3)[0] = (uint)in[i+0];
17443 (out+j+3)[1] = (uint)in[i+1];
17444 (out+j+3)[2] = (uint)in[i+3];
17455 const uint* restrict in = (const uint* restrict)_in;
17456 uint * restrict out = (uint* restrict)_out;
17460 (out+j)[0] = (uint)in[i+1];
17461 (out+j)[1] = (uint)in[i+2];
17462 (out+j)[2] = (uint)in[start];
17473 const uint* restrict in = (const uint* restrict)_in;
17474 uint * restrict out = (uint* restrict)_out;
17478 (out+j)[0] = (uint)in[i+0];
17479 (out+j)[1] = (uint)in[i+1];
17480 (out+j)[2] = (uint)in[i+2];
17481 (out+j)[3] = (uint)in[i+3];
17492 const uint* restrict in = (const uint* restrict)_in;
17493 uint * restrict out = (uint* restrict)_out;
17497 (out+j)[0] = (uint)in[i+0];
17498 (out+j)[1] = (uint)in[i+1];
17499 (out+j)[2] = (uint)in[i+2];
17500 (out+j)[3] = (uint)in[i+3];
17511 const uint* restrict in = (const uint* restrict)_in;
17512 uint * restrict out = (uint* restrict)_out;
17516 (out+j)[0] = (uint)in[i+0];
17517 (out+j)[1] = (uint)in[i+1];
17518 (out+j)[2] = (uint)in[i+2];
17519 (out+j)[3] = (uint)in[i+3];
17520 (out+j)[4] = (uint)in[i+4];
17521 (out+j)[5] = (uint)in[i+5];
17532 const uint* restrict in = (const uint* restrict)_in;
17533 uint * restrict out = (uint* restrict)_out;
17539 (out+j)[0] = (uint)in[i+0];
17540 (out+j)[1] = (uint)in[i+1];
17541 (out+j)[2] = (uint)in[i+2];
17542 (out+j)[3] = (uint)in[i+3];
17543 (out+j)[4] = (uint)in[i+4];
17544 (out+j)[5] = (uint)in[i+5];
17547 (out+j)[0] = (uint)in[i+2];
17548 (out+j)[1] = (uint)in[i-2];
17549 (out+j)[2] = (uint)in[i+0];
17550 (out+j)[3] = (uint)in[i+3];
17551 (out+j)[4] = (uint)in[i+4];
17552 (out+j)[5] = (uint)in[i+6];
17564 const uint* restrict in = (const uint* restrict)_in;
17565 uint * restrict out = (uint* restrict)_out;
17569 (out+j)[0] = (uint)in[i];
17580 const uint* restrict in = (const uint* restrict)_in;
17581 uint * restrict out = (uint* restrict)_out;
17585 (out+j)[0] = (uint)in[i];
17586 (out+j)[1] = (uint)in[i+1];
17597 const uint* restrict in = (const uint* restrict)_in;
17598 uint * restrict out = (uint* restrict)_out;
17602 (out+j)[0] = (uint)in[i];
17603 (out+j)[1] = (uint)in[i+1];
17614 const uint* restrict in = (const uint* restrict)_in;
17615 uint * restrict out = (uint* restrict)_out;
17628 (out+j)[0] = (uint)in[end];
17629 (out+j)[1] = (uint)in[start];
17637 (out+j)[0] = (uint)in[end];
17638 (out+j)[1] = (uint)in[start];
17644 (out+j)[0] = (uint)in[i];
17645 (out+j)[1] = (uint)in[i+1];
17648 (out+j)[0] = (uint)in[end];
17649 (out+j)[1] = (uint)in[start];
17659 const uint* restrict in = (const uint* restrict)_in;
17660 uint * restrict out = (uint* restrict)_out;
17664 (out+j)[0] = (uint)in[i];
17665 (out+j)[1] = (uint)in[i+1];
17666 (out+j)[2] = (uint)in[i+2];
17677 const uint* restrict in = (const uint* restrict)_in;
17678 uint * restrict out = (uint* restrict)_out;
17682 (out+j)[0] = (uint)in[i+(i&1)];
17683 (out+j)[1] = (uint)in[i+1-(i&1)];
17684 (out+j)[2] = (uint)in[i+2];
17695 const uint* restrict in = (const uint* restrict)_in;
17696 uint * restrict out = (uint* restrict)_out;
17722 (out+j)[0] = (uint)in[start];
17723 (out+j)[1] = (uint)in[i+1];
17724 (out+j)[2] = (uint)in[i+2];
17735 const uint* restrict in = (const uint* restrict)_in;
17736 uint * restrict out = (uint* restrict)_out;
17766 (out+j+0)[0] = (uint)in[i+0];
17767 (out+j+0)[1] = (uint)in[i+1];
17768 (out+j+0)[2] = (uint)in[i+3];
17769 (out+j+3)[0] = (uint)in[i+1];
17770 (out+j+3)[1] = (uint)in[i+2];
17771 (out+j+3)[2] = (uint)in[i+3];
17782 const uint* restrict in = (const uint* restrict)_in;
17783 uint * restrict out = (uint* restrict)_out;
17813 (out+j+0)[0] = (uint)in[i+2];
17814 (out+j+0)[1] = (uint)in[i+0];
17815 (out+j+0)[2] = (uint)in[i+3];
17816 (out+j+3)[0] = (uint)in[i+0];
17817 (out+j+3)[1] = (uint)in[i+1];
17818 (out+j+3)[2] = (uint)in[i+3];
17829 const uint* restrict in = (const uint* restrict)_in;
17830 uint * restrict out = (uint* restrict)_out;
17856 (out+j)[0] = (uint)in[i+1];
17857 (out+j)[1] = (uint)in[i+2];
17858 (out+j)[2] = (uint)in[start];
17869 const uint* restrict in = (const uint* restrict)_in;
17870 uint * restrict out = (uint* restrict)_out;
17874 (out+j)[0] = (uint)in[i+0];
17875 (out+j)[1] = (uint)in[i+1];
17876 (out+j)[2] = (uint)in[i+2];
17877 (out+j)[3] = (uint)in[i+3];
17888 const uint* restrict in = (const uint* restrict)_in;
17889 uint * restrict out = (uint* restrict)_out;
17893 (out+j)[0] = (uint)in[i+0];
17894 (out+j)[1] = (uint)in[i+1];
17895 (out+j)[2] = (uint)in[i+2];
17896 (out+j)[3] = (uint)in[i+3];
17907 const uint* restrict in = (const uint* restrict)_in;
17908 uint * restrict out = (uint* restrict)_out;
17912 (out+j)[0] = (uint)in[i+0];
17913 (out+j)[1] = (uint)in[i+1];
17914 (out+j)[2] = (uint)in[i+2];
17915 (out+j)[3] = (uint)in[i+3];
17916 (out+j)[4] = (uint)in[i+4];
17917 (out+j)[5] = (uint)in[i+5];
17928 const uint* restrict in = (const uint* restrict)_in;
17929 uint * restrict out = (uint* restrict)_out;
17935 (out+j)[0] = (uint)in[i+0];
17936 (out+j)[1] = (uint)in[i+1];
17937 (out+j)[2] = (uint)in[i+2];
17938 (out+j)[3] = (uint)in[i+3];
17939 (out+j)[4] = (uint)in[i+4];
17940 (out+j)[5] = (uint)in[i+5];
17943 (out+j)[0] = (uint)in[i+2];
17944 (out+j)[1] = (uint)in[i-2];
17945 (out+j)[2] = (uint)in[i+0];
17946 (out+j)[3] = (uint)in[i+3];
17947 (out+j)[4] = (uint)in[i+4];
17948 (out+j)[5] = (uint)in[i+6];