Lines Matching refs:out
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
69 ushort * restrict out = (ushort* restrict)_out;
73 (out+j)[0] = (ushort)(i);
81 ushort * restrict out = (ushort* restrict)_out;
85 (out+j)[0] = (ushort)(i);
86 (out+j)[1] = (ushort)(i+1);
94 ushort * restrict out = (ushort* restrict)_out;
98 (out+j)[0] = (ushort)(i);
99 (out+j)[1] = (ushort)(i+1);
107 ushort * restrict out = (ushort* restrict)_out;
112 (out+j)[0] = (ushort)(i);
113 (out+j)[1] = (ushort)(i+1);
116 (out+j)[0] = (ushort)(end);
117 (out+j)[1] = (ushort)(start);
124 ushort * restrict out = (ushort* restrict)_out;
128 (out+j)[0] = (ushort)(i);
129 (out+j)[1] = (ushort)(i+1);
130 (out+j)[2] = (ushort)(i+2);
138 ushort * restrict out = (ushort* restrict)_out;
142 (out+j)[0] = (ushort)(i);
143 (out+j)[1] = (ushort)(i+1+(i&1));
144 (out+j)[2] = (ushort)(i+2-(i&1));
152 ushort * restrict out = (ushort* restrict)_out;
156 (out+j)[0] = (ushort)(i+1);
157 (out+j)[1] = (ushort)(i+2);
158 (out+j)[2] = (ushort)(start);
166 ushort * restrict out = (ushort* restrict)_out;
170 (out+j+0)[0] = (ushort)(i+0);
171 (out+j+0)[1] = (ushort)(i+1);
172 (out+j+0)[2] = (ushort)(i+2);
173 (out+j+3)[0] = (ushort)(i+0);
174 (out+j+3)[1] = (ushort)(i+2);
175 (out+j+3)[2] = (ushort)(i+3);
183 ushort * restrict out = (ushort* restrict)_out;
187 (out+j+0)[0] = (ushort)(i+0);
188 (out+j+0)[1] = (ushort)(i+1);
189 (out+j+0)[2] = (ushort)(i+3);
190 (out+j+3)[0] = (ushort)(i+0);
191 (out+j+3)[1] = (ushort)(i+3);
192 (out+j+3)[2] = (ushort)(i+2);
200 ushort * restrict out = (ushort* restrict)_out;
204 (out+j)[0] = (ushort)(start);
205 (out+j)[1] = (ushort)(i+1);
206 (out+j)[2] = (ushort)(i+2);
214 ushort * restrict out = (ushort* restrict)_out;
218 (out+j)[0] = (ushort)(i+0);
219 (out+j)[1] = (ushort)(i+1);
220 (out+j)[2] = (ushort)(i+2);
221 (out+j)[3] = (ushort)(i+3);
229 ushort * restrict out = (ushort* restrict)_out;
233 (out+j)[0] = (ushort)(i+0);
234 (out+j)[1] = (ushort)(i+1);
235 (out+j)[2] = (ushort)(i+2);
236 (out+j)[3] = (ushort)(i+3);
244 ushort * restrict out = (ushort* restrict)_out;
248 (out+j)[0] = (ushort)(i+0);
249 (out+j)[1] = (ushort)(i+1);
250 (out+j)[2] = (ushort)(i+2);
251 (out+j)[3] = (ushort)(i+3);
252 (out+j)[4] = (ushort)(i+4);
253 (out+j)[5] = (ushort)(i+5);
261 ushort * restrict out = (ushort* restrict)_out;
267 (out+j)[0] = (ushort)(i+0);
268 (out+j)[1] = (ushort)(i+1);
269 (out+j)[2] = (ushort)(i+2);
270 (out+j)[3] = (ushort)(i+3);
271 (out+j)[4] = (ushort)(i+4);
272 (out+j)[5] = (ushort)(i+5);
275 (out+j)[0] = (ushort)(i+2);
276 (out+j)[1] = (ushort)(i-2);
277 (out+j)[2] = (ushort)(i+0);
278 (out+j)[3] = (ushort)(i+3);
279 (out+j)[4] = (ushort)(i+4);
280 (out+j)[5] = (ushort)(i+6);
289 ushort * restrict out = (ushort* restrict)_out;
293 (out+j)[0] = (ushort)(i);
301 ushort * restrict out = (ushort* restrict)_out;
305 (out+j)[0] = (ushort)(i+1);
306 (out+j)[1] = (ushort)(i);
314 ushort * restrict out = (ushort* restrict)_out;
318 (out+j)[0] = (ushort)(i+1);
319 (out+j)[1] = (ushort)(i);
327 ushort * restrict out = (ushort* restrict)_out;
332 (out+j)[0] = (ushort)(i+1);
333 (out+j)[1] = (ushort)(i);
336 (out+j)[0] = (ushort)(start);
337 (out+j)[1] = (ushort)(end);
344 ushort * restrict out = (ushort* restrict)_out;
348 (out+j)[0] = (ushort)(i+1);
349 (out+j)[1] = (ushort)(i+2);
350 (out+j)[2] = (ushort)(i);
358 ushort * restrict out = (ushort* restrict)_out;
362 (out+j)[0] = (ushort)(i+1+(i&1));
363 (out+j)[1] = (ushort)(i+2-(i&1));
364 (out+j)[2] = (ushort)(i);
372 ushort * restrict out = (ushort* restrict)_out;
376 (out+j)[0] = (ushort)(i+2);
377 (out+j)[1] = (ushort)(start);
378 (out+j)[2] = (ushort)(i+1);
386 ushort * restrict out = (ushort* restrict)_out;
390 (out+j+0)[0] = (ushort)(i+1);
391 (out+j+0)[1] = (ushort)(i+2);
392 (out+j+0)[2] = (ushort)(i+0);
393 (out+j+3)[0] = (ushort)(i+2);
394 (out+j+3)[1] = (ushort)(i+3);
395 (out+j+3)[2] = (ushort)(i+0);
403 ushort * restrict out = (ushort* restrict)_out;
407 (out+j+0)[0] = (ushort)(i+1);
408 (out+j+0)[1] = (ushort)(i+3);
409 (out+j+0)[2] = (ushort)(i+0);
410 (out+j+3)[0] = (ushort)(i+3);
411 (out+j+3)[1] = (ushort)(i+2);
412 (out+j+3)[2] = (ushort)(i+0);
420 ushort * restrict out = (ushort* restrict)_out;
424 (out+j)[0] = (ushort)(i+1);
425 (out+j)[1] = (ushort)(i+2);
426 (out+j)[2] = (ushort)(start);
434 ushort * restrict out = (ushort* restrict)_out;
438 (out+j)[0] = (ushort)(i+3);
439 (out+j)[1] = (ushort)(i+2);
440 (out+j)[2] = (ushort)(i+1);
441 (out+j)[3] = (ushort)(i+0);
449 ushort * restrict out = (ushort* restrict)_out;
453 (out+j)[0] = (ushort)(i+3);
454 (out+j)[1] = (ushort)(i+2);
455 (out+j)[2] = (ushort)(i+1);
456 (out+j)[3] = (ushort)(i+0);
464 ushort * restrict out = (ushort* restrict)_out;
468 (out+j)[0] = (ushort)(i+4);
469 (out+j)[1] = (ushort)(i+5);
470 (out+j)[2] = (ushort)(i+0);
471 (out+j)[3] = (ushort)(i+1);
472 (out+j)[4] = (ushort)(i+2);
473 (out+j)[5] = (ushort)(i+3);
481 ushort * restrict out = (ushort* restrict)_out;
487 (out+j)[0] = (ushort)(i+4);
488 (out+j)[1] = (ushort)(i+5);
489 (out+j)[2] = (ushort)(i+0);
490 (out+j)[3] = (ushort)(i+1);
491 (out+j)[4] = (ushort)(i+2);
492 (out+j)[5] = (ushort)(i+3);
495 (out+j)[0] = (ushort)(i+4);
496 (out+j)[1] = (ushort)(i+6);
497 (out+j)[2] = (ushort)(i+2);
498 (out+j)[3] = (ushort)(i-2);
499 (out+j)[4] = (ushort)(i+0);
500 (out+j)[5] = (ushort)(i+3);
509 ushort * restrict out = (ushort* restrict)_out;
513 (out+j)[0] = (ushort)(i);
521 ushort * restrict out = (ushort* restrict)_out;
525 (out+j)[0] = (ushort)(i+1);
526 (out+j)[1] = (ushort)(i);
534 ushort * restrict out = (ushort* restrict)_out;
538 (out+j)[0] = (ushort)(i+1);
539 (out+j)[1] = (ushort)(i);
547 ushort * restrict out = (ushort* restrict)_out;
552 (out+j)[0] = (ushort)(i+1);
553 (out+j)[1] = (ushort)(i);
556 (out+j)[0] = (ushort)(start);
557 (out+j)[1] = (ushort)(end);
564 ushort * restrict out = (ushort* restrict)_out;
568 (out+j)[0] = (ushort)(i+2);
569 (out+j)[1] = (ushort)(i);
570 (out+j)[2] = (ushort)(i+1);
578 ushort * restrict out = (ushort* restrict)_out;
582 (out+j)[0] = (ushort)(i+2);
583 (out+j)[1] = (ushort)(i+(i&1));
584 (out+j)[2] = (ushort)(i+1-(i&1));
592 ushort * restrict out = (ushort* restrict)_out;
596 (out+j)[0] = (ushort)(i+2);
597 (out+j)[1] = (ushort)(start);
598 (out+j)[2] = (ushort)(i+1);
606 ushort * restrict out = (ushort* restrict)_out;
610 (out+j+0)[0] = (ushort)(i+3);
611 (out+j+0)[1] = (ushort)(i+0);
612 (out+j+0)[2] = (ushort)(i+1);
613 (out+j+3)[0] = (ushort)(i+3);
614 (out+j+3)[1] = (ushort)(i+1);
615 (out+j+3)[2] = (ushort)(i+2);
623 ushort * restrict out = (ushort* restrict)_out;
627 (out+j+0)[0] = (ushort)(i+3);
628 (out+j+0)[1] = (ushort)(i+2);
629 (out+j+0)[2] = (ushort)(i+0);
630 (out+j+3)[0] = (ushort)(i+3);
631 (out+j+3)[1] = (ushort)(i+0);
632 (out+j+3)[2] = (ushort)(i+1);
640 ushort * restrict out = (ushort* restrict)_out;
644 (out+j)[0] = (ushort)(start);
645 (out+j)[1] = (ushort)(i+1);
646 (out+j)[2] = (ushort)(i+2);
654 ushort * restrict out = (ushort* restrict)_out;
658 (out+j)[0] = (ushort)(i+3);
659 (out+j)[1] = (ushort)(i+2);
660 (out+j)[2] = (ushort)(i+1);
661 (out+j)[3] = (ushort)(i+0);
669 ushort * restrict out = (ushort* restrict)_out;
673 (out+j)[0] = (ushort)(i+3);
674 (out+j)[1] = (ushort)(i+2);
675 (out+j)[2] = (ushort)(i+1);
676 (out+j)[3] = (ushort)(i+0);
684 ushort * restrict out = (ushort* restrict)_out;
688 (out+j)[0] = (ushort)(i+4);
689 (out+j)[1] = (ushort)(i+5);
690 (out+j)[2] = (ushort)(i+0);
691 (out+j)[3] = (ushort)(i+1);
692 (out+j)[4] = (ushort)(i+2);
693 (out+j)[5] = (ushort)(i+3);
701 ushort * restrict out = (ushort* restrict)_out;
707 (out+j)[0] = (ushort)(i+4);
708 (out+j)[1] = (ushort)(i+5);
709 (out+j)[2] = (ushort)(i+0);
710 (out+j)[3] = (ushort)(i+1);
711 (out+j)[4] = (ushort)(i+2);
712 (out+j)[5] = (ushort)(i+3);
715 (out+j)[0] = (ushort)(i+4);
716 (out+j)[1] = (ushort)(i+6);
717 (out+j)[2] = (ushort)(i+2);
718 (out+j)[3] = (ushort)(i-2);
719 (out+j)[4] = (ushort)(i+0);
720 (out+j)[5] = (ushort)(i+3);
729 ushort * restrict out = (ushort* restrict)_out;
733 (out+j)[0] = (ushort)(i);
741 ushort * restrict out = (ushort* restrict)_out;
745 (out+j)[0] = (ushort)(i);
746 (out+j)[1] = (ushort)(i+1);
754 ushort * restrict out = (ushort* restrict)_out;
758 (out+j)[0] = (ushort)(i);
759 (out+j)[1] = (ushort)(i+1);
767 ushort * restrict out = (ushort* restrict)_out;
772 (out+j)[0] = (ushort)(i);
773 (out+j)[1] = (ushort)(i+1);
776 (out+j)[0] = (ushort)(end);
777 (out+j)[1] = (ushort)(start);
784 ushort * restrict out = (ushort* restrict)_out;
788 (out+j)[0] = (ushort)(i);
789 (out+j)[1] = (ushort)(i+1);
790 (out+j)[2] = (ushort)(i+2);
798 ushort * restrict out = (ushort* restrict)_out;
802 (out+j)[0] = (ushort)(i+(i&1));
803 (out+j)[1] = (ushort)(i+1-(i&1));
804 (out+j)[2] = (ushort)(i+2);
812 ushort * restrict out = (ushort* restrict)_out;
816 (out+j)[0] = (ushort)(start);
817 (out+j)[1] = (ushort)(i+1);
818 (out+j)[2] = (ushort)(i+2);
826 ushort * restrict out = (ushort* restrict)_out;
830 (out+j+0)[0] = (ushort)(i+0);
831 (out+j+0)[1] = (ushort)(i+1);
832 (out+j+0)[2] = (ushort)(i+3);
833 (out+j+3)[0] = (ushort)(i+1);
834 (out+j+3)[1] = (ushort)(i+2);
835 (out+j+3)[2] = (ushort)(i+3);
843 ushort * restrict out = (ushort* restrict)_out;
847 (out+j+0)[0] = (ushort)(i+2);
848 (out+j+0)[1] = (ushort)(i+0);
849 (out+j+0)[2] = (ushort)(i+3);
850 (out+j+3)[0] = (ushort)(i+0);
851 (out+j+3)[1] = (ushort)(i+1);
852 (out+j+3)[2] = (ushort)(i+3);
860 ushort * restrict out = (ushort* restrict)_out;
864 (out+j)[0] = (ushort)(i+1);
865 (out+j)[1] = (ushort)(i+2);
866 (out+j)[2] = (ushort)(start);
874 ushort * restrict out = (ushort* restrict)_out;
878 (out+j)[0] = (ushort)(i+0);
879 (out+j)[1] = (ushort)(i+1);
880 (out+j)[2] = (ushort)(i+2);
881 (out+j)[3] = (ushort)(i+3);
889 ushort * restrict out = (ushort* restrict)_out;
893 (out+j)[0] = (ushort)(i+0);
894 (out+j)[1] = (ushort)(i+1);
895 (out+j)[2] = (ushort)(i+2);
896 (out+j)[3] = (ushort)(i+3);
904 ushort * restrict out = (ushort* restrict)_out;
908 (out+j)[0] = (ushort)(i+0);
909 (out+j)[1] = (ushort)(i+1);
910 (out+j)[2] = (ushort)(i+2);
911 (out+j)[3] = (ushort)(i+3);
912 (out+j)[4] = (ushort)(i+4);
913 (out+j)[5] = (ushort)(i+5);
921 ushort * restrict out = (ushort* restrict)_out;
927 (out+j)[0] = (ushort)(i+0);
928 (out+j)[1] = (ushort)(i+1);
929 (out+j)[2] = (ushort)(i+2);
930 (out+j)[3] = (ushort)(i+3);
931 (out+j)[4] = (ushort)(i+4);
932 (out+j)[5] = (ushort)(i+5);
935 (out+j)[0] = (ushort)(i+2);
936 (out+j)[1] = (ushort)(i-2);
937 (out+j)[2] = (ushort)(i+0);
938 (out+j)[3] = (ushort)(i+3);
939 (out+j)[4] = (ushort)(i+4);
940 (out+j)[5] = (ushort)(i+6);
949 uint * restrict out = (uint* restrict)_out;
953 (out+j)[0] = (uint)(i);
961 uint * restrict out = (uint* restrict)_out;
965 (out+j)[0] = (uint)(i);
966 (out+j)[1] = (uint)(i+1);
974 uint * restrict out = (uint* restrict)_out;
978 (out+j)[0] = (uint)(i);
979 (out+j)[1] = (uint)(i+1);
987 uint * restrict out = (uint* restrict)_out;
992 (out+j)[0] = (uint)(i);
993 (out+j)[1] = (uint)(i+1);
996 (out+j)[0] = (uint)(end);
997 (out+j)[1] = (uint)(start);
1004 uint * restrict out = (uint* restrict)_out;
1008 (out+j)[0] = (uint)(i);
1009 (out+j)[1] = (uint)(i+1);
1010 (out+j)[2] = (uint)(i+2);
1018 uint * restrict out = (uint* restrict)_out;
1022 (out+j)[0] = (uint)(i);
1023 (out+j)[1] = (uint)(i+1+(i&1));
1024 (out+j)[2] = (uint)(i+2-(i&1));
1032 uint * restrict out = (uint* restrict)_out;
1036 (out+j)[0] = (uint)(i+1);
1037 (out+j)[1] = (uint)(i+2);
1038 (out+j)[2] = (uint)(start);
1046 uint * restrict out = (uint* restrict)_out;
1050 (out+j+0)[0] = (uint)(i+0);
1051 (out+j+0)[1] = (uint)(i+1);
1052 (out+j+0)[2] = (uint)(i+2);
1053 (out+j+3)[0] = (uint)(i+0);
1054 (out+j+3)[1] = (uint)(i+2);
1055 (out+j+3)[2] = (uint)(i+3);
1063 uint * restrict out = (uint* restrict)_out;
1067 (out+j+0)[0] = (uint)(i+0);
1068 (out+j+0)[1] = (uint)(i+1);
1069 (out+j+0)[2] = (uint)(i+3);
1070 (out+j+3)[0] = (uint)(i+0);
1071 (out+j+3)[1] = (uint)(i+3);
1072 (out+j+3)[2] = (uint)(i+2);
1080 uint * restrict out = (uint* restrict)_out;
1084 (out+j)[0] = (uint)(start);
1085 (out+j)[1] = (uint)(i+1);
1086 (out+j)[2] = (uint)(i+2);
1094 uint * restrict out = (uint* restrict)_out;
1098 (out+j)[0] = (uint)(i+0);
1099 (out+j)[1] = (uint)(i+1);
1100 (out+j)[2] = (uint)(i+2);
1101 (out+j)[3] = (uint)(i+3);
1109 uint * restrict out = (uint* restrict)_out;
1113 (out+j)[0] = (uint)(i+0);
1114 (out+j)[1] = (uint)(i+1);
1115 (out+j)[2] = (uint)(i+2);
1116 (out+j)[3] = (uint)(i+3);
1124 uint * restrict out = (uint* restrict)_out;
1128 (out+j)[0] = (uint)(i+0);
1129 (out+j)[1] = (uint)(i+1);
1130 (out+j)[2] = (uint)(i+2);
1131 (out+j)[3] = (uint)(i+3);
1132 (out+j)[4] = (uint)(i+4);
1133 (out+j)[5] = (uint)(i+5);
1141 uint * restrict out = (uint* restrict)_out;
1147 (out+j)[0] = (uint)(i+0);
1148 (out+j)[1] = (uint)(i+1);
1149 (out+j)[2] = (uint)(i+2);
1150 (out+j)[3] = (uint)(i+3);
1151 (out+j)[4] = (uint)(i+4);
1152 (out+j)[5] = (uint)(i+5);
1155 (out+j)[0] = (uint)(i+2);
1156 (out+j)[1] = (uint)(i-2);
1157 (out+j)[2] = (uint)(i+0);
1158 (out+j)[3] = (uint)(i+3);
1159 (out+j)[4] = (uint)(i+4);
1160 (out+j)[5] = (uint)(i+6);
1169 uint * restrict out = (uint* restrict)_out;
1173 (out+j)[0] = (uint)(i);
1181 uint * restrict out = (uint* restrict)_out;
1185 (out+j)[0] = (uint)(i+1);
1186 (out+j)[1] = (uint)(i);
1194 uint * restrict out = (uint* restrict)_out;
1198 (out+j)[0] = (uint)(i+1);
1199 (out+j)[1] = (uint)(i);
1207 uint * restrict out = (uint* restrict)_out;
1212 (out+j)[0] = (uint)(i+1);
1213 (out+j)[1] = (uint)(i);
1216 (out+j)[0] = (uint)(start);
1217 (out+j)[1] = (uint)(end);
1224 uint * restrict out = (uint* restrict)_out;
1228 (out+j)[0] = (uint)(i+1);
1229 (out+j)[1] = (uint)(i+2);
1230 (out+j)[2] = (uint)(i);
1238 uint * restrict out = (uint* restrict)_out;
1242 (out+j)[0] = (uint)(i+1+(i&1));
1243 (out+j)[1] = (uint)(i+2-(i&1));
1244 (out+j)[2] = (uint)(i);
1252 uint * restrict out = (uint* restrict)_out;
1256 (out+j)[0] = (uint)(i+2);
1257 (out+j)[1] = (uint)(start);
1258 (out+j)[2] = (uint)(i+1);
1266 uint * restrict out = (uint* restrict)_out;
1270 (out+j+0)[0] = (uint)(i+1);
1271 (out+j+0)[1] = (uint)(i+2);
1272 (out+j+0)[2] = (uint)(i+0);
1273 (out+j+3)[0] = (uint)(i+2);
1274 (out+j+3)[1] = (uint)(i+3);
1275 (out+j+3)[2] = (uint)(i+0);
1283 uint * restrict out = (uint* restrict)_out;
1287 (out+j+0)[0] = (uint)(i+1);
1288 (out+j+0)[1] = (uint)(i+3);
1289 (out+j+0)[2] = (uint)(i+0);
1290 (out+j+3)[0] = (uint)(i+3);
1291 (out+j+3)[1] = (uint)(i+2);
1292 (out+j+3)[2] = (uint)(i+0);
1300 uint * restrict out = (uint* restrict)_out;
1304 (out+j)[0] = (uint)(i+1);
1305 (out+j)[1] = (uint)(i+2);
1306 (out+j)[2] = (uint)(start);
1314 uint * restrict out = (uint* restrict)_out;
1318 (out+j)[0] = (uint)(i+3);
1319 (out+j)[1] = (uint)(i+2);
1320 (out+j)[2] = (uint)(i+1);
1321 (out+j)[3] = (uint)(i+0);
1329 uint * restrict out = (uint* restrict)_out;
1333 (out+j)[0] = (uint)(i+3);
1334 (out+j)[1] = (uint)(i+2);
1335 (out+j)[2] = (uint)(i+1);
1336 (out+j)[3] = (uint)(i+0);
1344 uint * restrict out = (uint* restrict)_out;
1348 (out+j)[0] = (uint)(i+4);
1349 (out+j)[1] = (uint)(i+5);
1350 (out+j)[2] = (uint)(i+0);
1351 (out+j)[3] = (uint)(i+1);
1352 (out+j)[4] = (uint)(i+2);
1353 (out+j)[5] = (uint)(i+3);
1361 uint * restrict out = (uint* restrict)_out;
1367 (out+j)[0] = (uint)(i+4);
1368 (out+j)[1] = (uint)(i+5);
1369 (out+j)[2] = (uint)(i+0);
1370 (out+j)[3] = (uint)(i+1);
1371 (out+j)[4] = (uint)(i+2);
1372 (out+j)[5] = (uint)(i+3);
1375 (out+j)[0] = (uint)(i+4);
1376 (out+j)[1] = (uint)(i+6);
1377 (out+j)[2] = (uint)(i+2);
1378 (out+j)[3] = (uint)(i-2);
1379 (out+j)[4] = (uint)(i+0);
1380 (out+j)[5] = (uint)(i+3);
1389 uint * restrict out = (uint* restrict)_out;
1393 (out+j)[0] = (uint)(i);
1401 uint * restrict out = (uint* restrict)_out;
1405 (out+j)[0] = (uint)(i+1);
1406 (out+j)[1] = (uint)(i);
1414 uint * restrict out = (uint* restrict)_out;
1418 (out+j)[0] = (uint)(i+1);
1419 (out+j)[1] = (uint)(i);
1427 uint * restrict out = (uint* restrict)_out;
1432 (out+j)[0] = (uint)(i+1);
1433 (out+j)[1] = (uint)(i);
1436 (out+j)[0] = (uint)(start);
1437 (out+j)[1] = (uint)(end);
1444 uint * restrict out = (uint* restrict)_out;
1448 (out+j)[0] = (uint)(i+2);
1449 (out+j)[1] = (uint)(i);
1450 (out+j)[2] = (uint)(i+1);
1458 uint * restrict out = (uint* restrict)_out;
1462 (out+j)[0] = (uint)(i+2);
1463 (out+j)[1] = (uint)(i+(i&1));
1464 (out+j)[2] = (uint)(i+1-(i&1));
1472 uint * restrict out = (uint* restrict)_out;
1476 (out+j)[0] = (uint)(i+2);
1477 (out+j)[1] = (uint)(start);
1478 (out+j)[2] = (uint)(i+1);
1486 uint * restrict out = (uint* restrict)_out;
1490 (out+j+0)[0] = (uint)(i+3);
1491 (out+j+0)[1] = (uint)(i+0);
1492 (out+j+0)[2] = (uint)(i+1);
1493 (out+j+3)[0] = (uint)(i+3);
1494 (out+j+3)[1] = (uint)(i+1);
1495 (out+j+3)[2] = (uint)(i+2);
1503 uint * restrict out = (uint* restrict)_out;
1507 (out+j+0)[0] = (uint)(i+3);
1508 (out+j+0)[1] = (uint)(i+2);
1509 (out+j+0)[2] = (uint)(i+0);
1510 (out+j+3)[0] = (uint)(i+3);
1511 (out+j+3)[1] = (uint)(i+0);
1512 (out+j+3)[2] = (uint)(i+1);
1520 uint * restrict out = (uint* restrict)_out;
1524 (out+j)[0] = (uint)(start);
1525 (out+j)[1] = (uint)(i+1);
1526 (out+j)[2] = (uint)(i+2);
1534 uint * restrict out = (uint* restrict)_out;
1538 (out+j)[0] = (uint)(i+3);
1539 (out+j)[1] = (uint)(i+2);
1540 (out+j)[2] = (uint)(i+1);
1541 (out+j)[3] = (uint)(i+0);
1549 uint * restrict out = (uint* restrict)_out;
1553 (out+j)[0] = (uint)(i+3);
1554 (out+j)[1] = (uint)(i+2);
1555 (out+j)[2] = (uint)(i+1);
1556 (out+j)[3] = (uint)(i+0);
1564 uint * restrict out = (uint* restrict)_out;
1568 (out+j)[0] = (uint)(i+4);
1569 (out+j)[1] = (uint)(i+5);
1570 (out+j)[2] = (uint)(i+0);
1571 (out+j)[3] = (uint)(i+1);
1572 (out+j)[4] = (uint)(i+2);
1573 (out+j)[5] = (uint)(i+3);
1581 uint * restrict out = (uint* restrict)_out;
1587 (out+j)[0] = (uint)(i+4);
1588 (out+j)[1] = (uint)(i+5);
1589 (out+j)[2] = (uint)(i+0);
1590 (out+j)[3] = (uint)(i+1);
1591 (out+j)[4] = (uint)(i+2);
1592 (out+j)[5] = (uint)(i+3);
1595 (out+j)[0] = (uint)(i+4);
1596 (out+j)[1] = (uint)(i+6);
1597 (out+j)[2] = (uint)(i+2);
1598 (out+j)[3] = (uint)(i-2);
1599 (out+j)[4] = (uint)(i+0);
1600 (out+j)[5] = (uint)(i+3);
1609 uint * restrict out = (uint* restrict)_out;
1613 (out+j)[0] = (uint)(i);
1621 uint * restrict out = (uint* restrict)_out;
1625 (out+j)[0] = (uint)(i);
1626 (out+j)[1] = (uint)(i+1);
1634 uint * restrict out = (uint* restrict)_out;
1638 (out+j)[0] = (uint)(i);
1639 (out+j)[1] = (uint)(i+1);
1647 uint * restrict out = (uint* restrict)_out;
1652 (out+j)[0] = (uint)(i);
1653 (out+j)[1] = (uint)(i+1);
1656 (out+j)[0] = (uint)(end);
1657 (out+j)[1] = (uint)(start);
1664 uint * restrict out = (uint* restrict)_out;
1668 (out+j)[0] = (uint)(i);
1669 (out+j)[1] = (uint)(i+1);
1670 (out+j)[2] = (uint)(i+2);
1678 uint * restrict out = (uint* restrict)_out;
1682 (out+j)[0] = (uint)(i+(i&1));
1683 (out+j)[1] = (uint)(i+1-(i&1));
1684 (out+j)[2] = (uint)(i+2);
1692 uint * restrict out = (uint* restrict)_out;
1696 (out+j)[0] = (uint)(start);
1697 (out+j)[1] = (uint)(i+1);
1698 (out+j)[2] = (uint)(i+2);
1706 uint * restrict out = (uint* restrict)_out;
1710 (out+j+0)[0] = (uint)(i+0);
1711 (out+j+0)[1] = (uint)(i+1);
1712 (out+j+0)[2] = (uint)(i+3);
1713 (out+j+3)[0] = (uint)(i+1);
1714 (out+j+3)[1] = (uint)(i+2);
1715 (out+j+3)[2] = (uint)(i+3);
1723 uint * restrict out = (uint* restrict)_out;
1727 (out+j+0)[0] = (uint)(i+2);
1728 (out+j+0)[1] = (uint)(i+0);
1729 (out+j+0)[2] = (uint)(i+3);
1730 (out+j+3)[0] = (uint)(i+0);
1731 (out+j+3)[1] = (uint)(i+1);
1732 (out+j+3)[2] = (uint)(i+3);
1740 uint * restrict out = (uint* restrict)_out;
1744 (out+j)[0] = (uint)(i+1);
1745 (out+j)[1] = (uint)(i+2);
1746 (out+j)[2] = (uint)(start);
1754 uint * restrict out = (uint* restrict)_out;
1758 (out+j)[0] = (uint)(i+0);
1759 (out+j)[1] = (uint)(i+1);
1760 (out+j)[2] = (uint)(i+2);
1761 (out+j)[3] = (uint)(i+3);
1769 uint * restrict out = (uint* restrict)_out;
1773 (out+j)[0] = (uint)(i+0);
1774 (out+j)[1] = (uint)(i+1);
1775 (out+j)[2] = (uint)(i+2);
1776 (out+j)[3] = (uint)(i+3);
1784 uint * restrict out = (uint* restrict)_out;
1788 (out+j)[0] = (uint)(i+0);
1789 (out+j)[1] = (uint)(i+1);
1790 (out+j)[2] = (uint)(i+2);
1791 (out+j)[3] = (uint)(i+3);
1792 (out+j)[4] = (uint)(i+4);
1793 (out+j)[5] = (uint)(i+5);
1801 uint * restrict out = (uint* restrict)_out;
1807 (out+j)[0] = (uint)(i+0);
1808 (out+j)[1] = (uint)(i+1);
1809 (out+j)[2] = (uint)(i+2);
1810 (out+j)[3] = (uint)(i+3);
1811 (out+j)[4] = (uint)(i+4);
1812 (out+j)[5] = (uint)(i+5);
1815 (out+j)[0] = (uint)(i+2);
1816 (out+j)[1] = (uint)(i-2);
1817 (out+j)[2] = (uint)(i+0);
1818 (out+j)[3] = (uint)(i+3);
1819 (out+j)[4] = (uint)(i+4);
1820 (out+j)[5] = (uint)(i+6);
1833 ushort * restrict out = (ushort* restrict)_out;
1837 (out+j)[0] = (ushort)in[i];
1849 ushort * restrict out = (ushort* restrict)_out;
1853 (out+j)[0] = (ushort)in[i];
1854 (out+j)[1] = (ushort)in[i+1];
1866 ushort * restrict out = (ushort* restrict)_out;
1870 (out+j)[0] = (ushort)in[i];
1871 (out+j)[1] = (ushort)in[i+1];
1883 ushort * restrict out = (ushort* restrict)_out;
1888 (out+j)[0] = (ushort)in[i];
1889 (out+j)[1] = (ushort)in[i+1];
1892 (out+j)[0] = (ushort)in[end];
1893 (out+j)[1] = (ushort)in[start];
1904 ushort * restrict out = (ushort* restrict)_out;
1908 (out+j)[0] = (ushort)in[i];
1909 (out+j)[1] = (ushort)in[i+1];
1910 (out+j)[2] = (ushort)in[i+2];
1922 ushort * restrict out = (ushort* restrict)_out;
1926 (out+j)[0] = (ushort)in[i];
1927 (out+j)[1] = (ushort)in[i+1+(i&1)];
1928 (out+j)[2] = (ushort)in[i+2-(i&1)];
1940 ushort * restrict out = (ushort* restrict)_out;
1944 (out+j)[0] = (ushort)in[i+1];
1945 (out+j)[1] = (ushort)in[i+2];
1946 (out+j)[2] = (ushort)in[start];
1958 ushort * restrict out = (ushort* restrict)_out;
1962 (out+j+0)[0] = (ushort)in[i+0];
1963 (out+j+0)[1] = (ushort)in[i+1];
1964 (out+j+0)[2] = (ushort)in[i+2];
1965 (out+j+3)[0] = (ushort)in[i+0];
1966 (out+j+3)[1] = (ushort)in[i+2];
1967 (out+j+3)[2] = (ushort)in[i+3];
1979 ushort * restrict out = (ushort* restrict)_out;
1983 (out+j+0)[0] = (ushort)in[i+0];
1984 (out+j+0)[1] = (ushort)in[i+1];
1985 (out+j+0)[2] = (ushort)in[i+3];
1986 (out+j+3)[0] = (ushort)in[i+0];
1987 (out+j+3)[1] = (ushort)in[i+3];
1988 (out+j+3)[2] = (ushort)in[i+2];
2000 ushort * restrict out = (ushort* restrict)_out;
2004 (out+j)[0] = (ushort)in[start];
2005 (out+j)[1] = (ushort)in[i+1];
2006 (out+j)[2] = (ushort)in[i+2];
2018 ushort * restrict out = (ushort* restrict)_out;
2022 (out+j)[0] = (ushort)in[i+0];
2023 (out+j)[1] = (ushort)in[i+1];
2024 (out+j)[2] = (ushort)in[i+2];
2025 (out+j)[3] = (ushort)in[i+3];
2037 ushort * restrict out = (ushort* restrict)_out;
2041 (out+j)[0] = (ushort)in[i+0];
2042 (out+j)[1] = (ushort)in[i+1];
2043 (out+j)[2] = (ushort)in[i+2];
2044 (out+j)[3] = (ushort)in[i+3];
2056 ushort * restrict out = (ushort* restrict)_out;
2060 (out+j)[0] = (ushort)in[i+0];
2061 (out+j)[1] = (ushort)in[i+1];
2062 (out+j)[2] = (ushort)in[i+2];
2063 (out+j)[3] = (ushort)in[i+3];
2064 (out+j)[4] = (ushort)in[i+4];
2065 (out+j)[5] = (ushort)in[i+5];
2077 ushort * restrict out = (ushort* restrict)_out;
2083 (out+j)[0] = (ushort)in[i+0];
2084 (out+j)[1] = (ushort)in[i+1];
2085 (out+j)[2] = (ushort)in[i+2];
2086 (out+j)[3] = (ushort)in[i+3];
2087 (out+j)[4] = (ushort)in[i+4];
2088 (out+j)[5] = (ushort)in[i+5];
2091 (out+j)[0] = (ushort)in[i+2];
2092 (out+j)[1] = (ushort)in[i-2];
2093 (out+j)[2] = (ushort)in[i+0];
2094 (out+j)[3] = (ushort)in[i+3];
2095 (out+j)[4] = (ushort)in[i+4];
2096 (out+j)[5] = (ushort)in[i+6];
2109 ushort * restrict out = (ushort* restrict)_out;
2113 (out+j)[0] = (ushort)in[i];
2125 ushort * restrict out = (ushort* restrict)_out;
2129 (out+j)[0] = (ushort)in[i];
2130 (out+j)[1] = (ushort)in[i+1];
2142 ushort * restrict out = (ushort* restrict)_out;
2146 (out+j)[0] = (ushort)in[i];
2147 (out+j)[1] = (ushort)in[i+1];
2159 ushort * restrict out = (ushort* restrict)_out;
2166 (out+j+0)[0] = restart_index;
2167 (out+j+0)[1] = restart_index;
2172 (out+j)[0] = (ushort)in[end];
2173 (out+j)[1] = (ushort)in[start];
2181 (out+j)[0] = (ushort)in[end];
2182 (out+j)[1] = (ushort)in[start];
2188 (out+j)[0] = (ushort)in[i];
2189 (out+j)[1] = (ushort)in[i+1];
2192 (out+j)[0] = (ushort)in[end];
2193 (out+j)[1] = (ushort)in[start];
2204 ushort * restrict out = (ushort* restrict)_out;
2208 (out+j)[0] = (ushort)in[i];
2209 (out+j)[1] = (ushort)in[i+1];
2210 (out+j)[2] = (ushort)in[i+2];
2222 ushort * restrict out = (ushort* restrict)_out;
2226 (out+j)[0] = (ushort)in[i];
2227 (out+j)[1] = (ushort)in[i+1+(i&1)];
2228 (out+j)[2] = (ushort)in[i+2-(i&1)];
2240 ushort * restrict out = (ushort* restrict)_out;
2246 (out+j+0)[0] = restart_index;
2247 (out+j+0)[1] = restart_index;
2248 (out+j+0)[2] = restart_index;
2266 (out+j)[0] = (ushort)in[i+1];
2267 (out+j)[1] = (ushort)in[i+2];
2268 (out+j)[2] = (ushort)in[start];
2280 ushort * restrict out = (ushort* restrict)_out;
2286 (out+j+0)[0] = restart_index;
2287 (out+j+0)[1] = restart_index;
2288 (out+j+0)[2] = restart_index;
2289 (out+j+3)[0] = restart_index;
2290 (out+j+3)[1] = restart_index;
2291 (out+j+3)[2] = restart_index;
2310 (out+j+0)[0] = (ushort)in[i+0];
2311 (out+j+0)[1] = (ushort)in[i+1];
2312 (out+j+0)[2] = (ushort)in[i+2];
2313 (out+j+3)[0] = (ushort)in[i+0];
2314 (out+j+3)[1] = (ushort)in[i+2];
2315 (out+j+3)[2] = (ushort)in[i+3];
2327 ushort * restrict out = (ushort* restrict)_out;
2333 (out+j+0)[0] = restart_index;
2334 (out+j+0)[1] = restart_index;
2335 (out+j+0)[2] = restart_index;
2336 (out+j+3)[0] = restart_index;
2337 (out+j+3)[1] = restart_index;
2338 (out+j+3)[2] = restart_index;
2357 (out+j+0)[0] = (ushort)in[i+0];
2358 (out+j+0)[1] = (ushort)in[i+1];
2359 (out+j+0)[2] = (ushort)in[i+3];
2360 (out+j+3)[0] = (ushort)in[i+0];
2361 (out+j+3)[1] = (ushort)in[i+3];
2362 (out+j+3)[2] = (ushort)in[i+2];
2374 ushort * restrict out = (ushort* restrict)_out;
2380 (out+j+0)[0] = restart_index;
2381 (out+j+0)[1] = restart_index;
2382 (out+j+0)[2] = restart_index;
2400 (out+j)[0] = (ushort)in[start];
2401 (out+j)[1] = (ushort)in[i+1];
2402 (out+j)[2] = (ushort)in[i+2];
2414 ushort * restrict out = (ushort* restrict)_out;
2418 (out+j)[0] = (ushort)in[i+0];
2419 (out+j)[1] = (ushort)in[i+1];
2420 (out+j)[2] = (ushort)in[i+2];
2421 (out+j)[3] = (ushort)in[i+3];
2433 ushort * restrict out = (ushort* restrict)_out;
2437 (out+j)[0] = (ushort)in[i+0];
2438 (out+j)[1] = (ushort)in[i+1];
2439 (out+j)[2] = (ushort)in[i+2];
2440 (out+j)[3] = (ushort)in[i+3];
2452 ushort * restrict out = (ushort* restrict)_out;
2456 (out+j)[0] = (ushort)in[i+0];
2457 (out+j)[1] = (ushort)in[i+1];
2458 (out+j)[2] = (ushort)in[i+2];
2459 (out+j)[3] = (ushort)in[i+3];
2460 (out+j)[4] = (ushort)in[i+4];
2461 (out+j)[5] = (ushort)in[i+5];
2473 ushort * restrict out = (ushort* restrict)_out;
2479 (out+j)[0] = (ushort)in[i+0];
2480 (out+j)[1] = (ushort)in[i+1];
2481 (out+j)[2] = (ushort)in[i+2];
2482 (out+j)[3] = (ushort)in[i+3];
2483 (out+j)[4] = (ushort)in[i+4];
2484 (out+j)[5] = (ushort)in[i+5];
2487 (out+j)[0] = (ushort)in[i+2];
2488 (out+j)[1] = (ushort)in[i-2];
2489 (out+j)[2] = (ushort)in[i+0];
2490 (out+j)[3] = (ushort)in[i+3];
2491 (out+j)[4] = (ushort)in[i+4];
2492 (out+j)[5] = (ushort)in[i+6];
2505 ushort * restrict out = (ushort* restrict)_out;
2509 (out+j)[0] = (ushort)in[i];
2521 ushort * restrict out = (ushort* restrict)_out;
2525 (out+j)[0] = (ushort)in[i+1];
2526 (out+j)[1] = (ushort)in[i];
2538 ushort * restrict out = (ushort* restrict)_out;
2542 (out+j)[0] = (ushort)in[i+1];
2543 (out+j)[1] = (ushort)in[i];
2555 ushort * restrict out = (ushort* restrict)_out;
2560 (out+j)[0] = (ushort)in[i+1];
2561 (out+j)[1] = (ushort)in[i];
2564 (out+j)[0] = (ushort)in[start];
2565 (out+j)[1] = (ushort)in[end];
2576 ushort * restrict out = (ushort* restrict)_out;
2580 (out+j)[0] = (ushort)in[i+1];
2581 (out+j)[1] = (ushort)in[i+2];
2582 (out+j)[2] = (ushort)in[i];
2594 ushort * restrict out = (ushort* restrict)_out;
2598 (out+j)[0] = (ushort)in[i+1+(i&1)];
2599 (out+j)[1] = (ushort)in[i+2-(i&1)];
2600 (out+j)[2] = (ushort)in[i];
2612 ushort * restrict out = (ushort* restrict)_out;
2616 (out+j)[0] = (ushort)in[i+2];
2617 (out+j)[1] = (ushort)in[start];
2618 (out+j)[2] = (ushort)in[i+1];
2630 ushort * restrict out = (ushort* restrict)_out;
2634 (out+j+0)[0] = (ushort)in[i+1];
2635 (out+j+0)[1] = (ushort)in[i+2];
2636 (out+j+0)[2] = (ushort)in[i+0];
2637 (out+j+3)[0] = (ushort)in[i+2];
2638 (out+j+3)[1] = (ushort)in[i+3];
2639 (out+j+3)[2] = (ushort)in[i+0];
2651 ushort * restrict out = (ushort* restrict)_out;
2655 (out+j+0)[0] = (ushort)in[i+1];
2656 (out+j+0)[1] = (ushort)in[i+3];
2657 (out+j+0)[2] = (ushort)in[i+0];
2658 (out+j+3)[0] = (ushort)in[i+3];
2659 (out+j+3)[1] = (ushort)in[i+2];
2660 (out+j+3)[2] = (ushort)in[i+0];
2672 ushort * restrict out = (ushort* restrict)_out;
2676 (out+j)[0] = (ushort)in[i+1];
2677 (out+j)[1] = (ushort)in[i+2];
2678 (out+j)[2] = (ushort)in[start];
2690 ushort * restrict out = (ushort* restrict)_out;
2694 (out+j)[0] = (ushort)in[i+3];
2695 (out+j)[1] = (ushort)in[i+2];
2696 (out+j)[2] = (ushort)in[i+1];
2697 (out+j)[3] = (ushort)in[i+0];
2709 ushort * restrict out = (ushort* restrict)_out;
2713 (out+j)[0] = (ushort)in[i+3];
2714 (out+j)[1] = (ushort)in[i+2];
2715 (out+j)[2] = (ushort)in[i+1];
2716 (out+j)[3] = (ushort)in[i+0];
2728 ushort * restrict out = (ushort* restrict)_out;
2732 (out+j)[0] = (ushort)in[i+4];
2733 (out+j)[1] = (ushort)in[i+5];
2734 (out+j)[2] = (ushort)in[i+0];
2735 (out+j)[3] = (ushort)in[i+1];
2736 (out+j)[4] = (ushort)in[i+2];
2737 (out+j)[5] = (ushort)in[i+3];
2749 ushort * restrict out = (ushort* restrict)_out;
2755 (out+j)[0] = (ushort)in[i+4];
2756 (out+j)[1] = (ushort)in[i+5];
2757 (out+j)[2] = (ushort)in[i+0];
2758 (out+j)[3] = (ushort)in[i+1];
2759 (out+j)[4] = (ushort)in[i+2];
2760 (out+j)[5] = (ushort)in[i+3];
2763 (out+j)[0] = (ushort)in[i+4];
2764 (out+j)[1] = (ushort)in[i+6];
2765 (out+j)[2] = (ushort)in[i+2];
2766 (out+j)[3] = (ushort)in[i-2];
2767 (out+j)[4] = (ushort)in[i+0];
2768 (out+j)[5] = (ushort)in[i+3];
2781 ushort * restrict out = (ushort* restrict)_out;
2785 (out+j)[0] = (ushort)in[i];
2797 ushort * restrict out = (ushort* restrict)_out;
2801 (out+j)[0] = (ushort)in[i+1];
2802 (out+j)[1] = (ushort)in[i];
2814 ushort * restrict out = (ushort* restrict)_out;
2818 (out+j)[0] = (ushort)in[i+1];
2819 (out+j)[1] = (ushort)in[i];
2831 ushort * restrict out = (ushort* restrict)_out;
2838 (out+j+0)[0] = restart_index;
2839 (out+j+0)[1] = restart_index;
2844 (out+j)[0] = (ushort)in[start];
2845 (out+j)[1] = (ushort)in[end];
2853 (out+j)[0] = (ushort)in[start];
2854 (out+j)[1] = (ushort)in[end];
2860 (out+j)[0] = (ushort)in[i+1];
2861 (out+j)[1] = (ushort)in[i];
2864 (out+j)[0] = (ushort)in[start];
2865 (out+j)[1] = (ushort)in[end];
2876 ushort * restrict out = (ushort* restrict)_out;
2880 (out+j)[0] = (ushort)in[i+1];
2881 (out+j)[1] = (ushort)in[i+2];
2882 (out+j)[2] = (ushort)in[i];
2894 ushort * restrict out = (ushort* restrict)_out;
2898 (out+j)[0] = (ushort)in[i+1+(i&1)];
2899 (out+j)[1] = (ushort)in[i+2-(i&1)];
2900 (out+j)[2] = (ushort)in[i];
2912 ushort * restrict out = (ushort* restrict)_out;
2918 (out+j+0)[0] = restart_index;
2919 (out+j+0)[1] = restart_index;
2920 (out+j+0)[2] = restart_index;
2938 (out+j)[0] = (ushort)in[i+2];
2939 (out+j)[1] = (ushort)in[start];
2940 (out+j)[2] = (ushort)in[i+1];
2952 ushort * restrict out = (ushort* restrict)_out;
2958 (out+j+0)[0] = restart_index;
2959 (out+j+0)[1] = restart_index;
2960 (out+j+0)[2] = restart_index;
2961 (out+j+3)[0] = restart_index;
2962 (out+j+3)[1] = restart_index;
2963 (out+j+3)[2] = restart_index;
2982 (out+j+0)[0] = (ushort)in[i+1];
2983 (out+j+0)[1] = (ushort)in[i+2];
2984 (out+j+0)[2] = (ushort)in[i+0];
2985 (out+j+3)[0] = (ushort)in[i+2];
2986 (out+j+3)[1] = (ushort)in[i+3];
2987 (out+j+3)[2] = (ushort)in[i+0];
2999 ushort * restrict out = (ushort* restrict)_out;
3005 (out+j+0)[0] = restart_index;
3006 (out+j+0)[1] = restart_index;
3007 (out+j+0)[2] = restart_index;
3008 (out+j+3)[0] = restart_index;
3009 (out+j+3)[1] = restart_index;
3010 (out+j+3)[2] = restart_index;
3029 (out+j+0)[0] = (ushort)in[i+1];
3030 (out+j+0)[1] = (ushort)in[i+3];
3031 (out+j+0)[2] = (ushort)in[i+0];
3032 (out+j+3)[0] = (ushort)in[i+3];
3033 (out+j+3)[1] = (ushort)in[i+2];
3034 (out+j+3)[2] = (ushort)in[i+0];
3046 ushort * restrict out = (ushort* restrict)_out;
3052 (out+j+0)[0] = restart_index;
3053 (out+j+0)[1] = restart_index;
3054 (out+j+0)[2] = restart_index;
3072 (out+j)[0] = (ushort)in[i+1];
3073 (out+j)[1] = (ushort)in[i+2];
3074 (out+j)[2] = (ushort)in[start];
3086 ushort * restrict out = (ushort* restrict)_out;
3090 (out+j)[0] = (ushort)in[i+3];
3091 (out+j)[1] = (ushort)in[i+2];
3092 (out+j)[2] = (ushort)in[i+1];
3093 (out+j)[3] = (ushort)in[i+0];
3105 ushort * restrict out = (ushort* restrict)_out;
3109 (out+j)[0] = (ushort)in[i+3];
3110 (out+j)[1] = (ushort)in[i+2];
3111 (out+j)[2] = (ushort)in[i+1];
3112 (out+j)[3] = (ushort)in[i+0];
3124 ushort * restrict out = (ushort* restrict)_out;
3128 (out+j)[0] = (ushort)in[i+4];
3129 (out+j)[1] = (ushort)in[i+5];
3130 (out+j)[2] = (ushort)in[i+0];
3131 (out+j)[3] = (ushort)in[i+1];
3132 (out+j)[4] = (ushort)in[i+2];
3133 (out+j)[5] = (ushort)in[i+3];
3145 ushort * restrict out = (ushort* restrict)_out;
3151 (out+j)[0] = (ushort)in[i+4];
3152 (out+j)[1] = (ushort)in[i+5];
3153 (out+j)[2] = (ushort)in[i+0];
3154 (out+j)[3] = (ushort)in[i+1];
3155 (out+j)[4] = (ushort)in[i+2];
3156 (out+j)[5] = (ushort)in[i+3];
3159 (out+j)[0] = (ushort)in[i+4];
3160 (out+j)[1] = (ushort)in[i+6];
3161 (out+j)[2] = (ushort)in[i+2];
3162 (out+j)[3] = (ushort)in[i-2];
3163 (out+j)[4] = (ushort)in[i+0];
3164 (out+j)[5] = (ushort)in[i+3];
3177 ushort * restrict out = (ushort* restrict)_out;
3181 (out+j)[0] = (ushort)in[i];
3193 ushort * restrict out = (ushort* restrict)_out;
3197 (out+j)[0] = (ushort)in[i+1];
3198 (out+j)[1] = (ushort)in[i];
3210 ushort * restrict out = (ushort* restrict)_out;
3214 (out+j)[0] = (ushort)in[i+1];
3215 (out+j)[1] = (ushort)in[i];
3227 ushort * restrict out = (ushort* restrict)_out;
3232 (out+j)[0] = (ushort)in[i+1];
3233 (out+j)[1] = (ushort)in[i];
3236 (out+j)[0] = (ushort)in[start];
3237 (out+j)[1] = (ushort)in[end];
3248 ushort * restrict out = (ushort* restrict)_out;
3252 (out+j)[0] = (ushort)in[i+2];
3253 (out+j)[1] = (ushort)in[i];
3254 (out+j)[2] = (ushort)in[i+1];
3266 ushort * restrict out = (ushort* restrict)_out;
3270 (out+j)[0] = (ushort)in[i+2];
3271 (out+j)[1] = (ushort)in[i+(i&1)];
3272 (out+j)[2] = (ushort)in[i+1-(i&1)];
3284 ushort * restrict out = (ushort* restrict)_out;
3288 (out+j)[0] = (ushort)in[i+2];
3289 (out+j)[1] = (ushort)in[start];
3290 (out+j)[2] = (ushort)in[i+1];
3302 ushort * restrict out = (ushort* restrict)_out;
3306 (out+j+0)[0] = (ushort)in[i+3];
3307 (out+j+0)[1] = (ushort)in[i+0];
3308 (out+j+0)[2] = (ushort)in[i+1];
3309 (out+j+3)[0] = (ushort)in[i+3];
3310 (out+j+3)[1] = (ushort)in[i+1];
3311 (out+j+3)[2] = (ushort)in[i+2];
3323 ushort * restrict out = (ushort* restrict)_out;
3327 (out+j+0)[0] = (ushort)in[i+3];
3328 (out+j+0)[1] = (ushort)in[i+2];
3329 (out+j+0)[2] = (ushort)in[i+0];
3330 (out+j+3)[0] = (ushort)in[i+3];
3331 (out+j+3)[1] = (ushort)in[i+0];
3332 (out+j+3)[2] = (ushort)in[i+1];
3344 ushort * restrict out = (ushort* restrict)_out;
3348 (out+j)[0] = (ushort)in[start];
3349 (out+j)[1] = (ushort)in[i+1];
3350 (out+j)[2] = (ushort)in[i+2];
3362 ushort * restrict out = (ushort* restrict)_out;
3366 (out+j)[0] = (ushort)in[i+3];
3367 (out+j)[1] = (ushort)in[i+2];
3368 (out+j)[2] = (ushort)in[i+1];
3369 (out+j)[3] = (ushort)in[i+0];
3381 ushort * restrict out = (ushort* restrict)_out;
3385 (out+j)[0] = (ushort)in[i+3];
3386 (out+j)[1] = (ushort)in[i+2];
3387 (out+j)[2] = (ushort)in[i+1];
3388 (out+j)[3] = (ushort)in[i+0];
3400 ushort * restrict out = (ushort* restrict)_out;
3404 (out+j)[0] = (ushort)in[i+4];
3405 (out+j)[1] = (ushort)in[i+5];
3406 (out+j)[2] = (ushort)in[i+0];
3407 (out+j)[3] = (ushort)in[i+1];
3408 (out+j)[4] = (ushort)in[i+2];
3409 (out+j)[5] = (ushort)in[i+3];
3421 ushort * restrict out = (ushort* restrict)_out;
3427 (out+j)[0] = (ushort)in[i+4];
3428 (out+j)[1] = (ushort)in[i+5];
3429 (out+j)[2] = (ushort)in[i+0];
3430 (out+j)[3] = (ushort)in[i+1];
3431 (out+j)[4] = (ushort)in[i+2];
3432 (out+j)[5] = (ushort)in[i+3];
3435 (out+j)[0] = (ushort)in[i+4];
3436 (out+j)[1] = (ushort)in[i+6];
3437 (out+j)[2] = (ushort)in[i+2];
3438 (out+j)[3] = (ushort)in[i-2];
3439 (out+j)[4] = (ushort)in[i+0];
3440 (out+j)[5] = (ushort)in[i+3];
3453 ushort * restrict out = (ushort* restrict)_out;
3457 (out+j)[0] = (ushort)in[i];
3469 ushort * restrict out = (ushort* restrict)_out;
3473 (out+j)[0] = (ushort)in[i+1];
3474 (out+j)[1] = (ushort)in[i];
3486 ushort * restrict out = (ushort* restrict)_out;
3490 (out+j)[0] = (ushort)in[i+1];
3491 (out+j)[1] = (ushort)in[i];
3503 ushort * restrict out = (ushort* restrict)_out;
3510 (out+j+0)[0] = restart_index;
3511 (out+j+0)[1] = restart_index;
3516 (out+j)[0] = (ushort)in[start];
3517 (out+j)[1] = (ushort)in[end];
3525 (out+j)[0] = (ushort)in[start];
3526 (out+j)[1] = (ushort)in[end];
3532 (out+j)[0] = (ushort)in[i+1];
3533 (out+j)[1] = (ushort)in[i];
3536 (out+j)[0] = (ushort)in[start];
3537 (out+j)[1] = (ushort)in[end];
3548 ushort * restrict out = (ushort* restrict)_out;
3552 (out+j)[0] = (ushort)in[i+2];
3553 (out+j)[1] = (ushort)in[i];
3554 (out+j)[2] = (ushort)in[i+1];
3566 ushort * restrict out = (ushort* restrict)_out;
3570 (out+j)[0] = (ushort)in[i+2];
3571 (out+j)[1] = (ushort)in[i+(i&1)];
3572 (out+j)[2] = (ushort)in[i+1-(i&1)];
3584 ushort * restrict out = (ushort* restrict)_out;
3590 (out+j+0)[0] = restart_index;
3591 (out+j+0)[1] = restart_index;
3592 (out+j+0)[2] = restart_index;
3610 (out+j)[0] = (ushort)in[i+2];
3611 (out+j)[1] = (ushort)in[start];
3612 (out+j)[2] = (ushort)in[i+1];
3624 ushort * restrict out = (ushort* restrict)_out;
3630 (out+j+0)[0] = restart_index;
3631 (out+j+0)[1] = restart_index;
3632 (out+j+0)[2] = restart_index;
3633 (out+j+3)[0] = restart_index;
3634 (out+j+3)[1] = restart_index;
3635 (out+j+3)[2] = restart_index;
3654 (out+j+0)[0] = (ushort)in[i+3];
3655 (out+j+0)[1] = (ushort)in[i+0];
3656 (out+j+0)[2] = (ushort)in[i+1];
3657 (out+j+3)[0] = (ushort)in[i+3];
3658 (out+j+3)[1] = (ushort)in[i+1];
3659 (out+j+3)[2] = (ushort)in[i+2];
3671 ushort * restrict out = (ushort* restrict)_out;
3677 (out+j+0)[0] = restart_index;
3678 (out+j+0)[1] = restart_index;
3679 (out+j+0)[2] = restart_index;
3680 (out+j+3)[0] = restart_index;
3681 (out+j+3)[1] = restart_index;
3682 (out+j+3)[2] = restart_index;
3701 (out+j+0)[0] = (ushort)in[i+3];
3702 (out+j+0)[1] = (ushort)in[i+2];
3703 (out+j+0)[2] = (ushort)in[i+0];
3704 (out+j+3)[0] = (ushort)in[i+3];
3705 (out+j+3)[1] = (ushort)in[i+0];
3706 (out+j+3)[2] = (ushort)in[i+1];
3718 ushort * restrict out = (ushort* restrict)_out;
3724 (out+j+0)[0] = restart_index;
3725 (out+j+0)[1] = restart_index;
3726 (out+j+0)[2] = restart_index;
3744 (out+j)[0] = (ushort)in[start];
3745 (out+j)[1] = (ushort)in[i+1];
3746 (out+j)[2] = (ushort)in[i+2];
3758 ushort * restrict out = (ushort* restrict)_out;
3762 (out+j)[0] = (ushort)in[i+3];
3763 (out+j)[1] = (ushort)in[i+2];
3764 (out+j)[2] = (ushort)in[i+1];
3765 (out+j)[3] = (ushort)in[i+0];
3777 ushort * restrict out = (ushort* restrict)_out;
3781 (out+j)[0] = (ushort)in[i+3];
3782 (out+j)[1] = (ushort)in[i+2];
3783 (out+j)[2] = (ushort)in[i+1];
3784 (out+j)[3] = (ushort)in[i+0];
3796 ushort * restrict out = (ushort* restrict)_out;
3800 (out+j)[0] = (ushort)in[i+4];
3801 (out+j)[1] = (ushort)in[i+5];
3802 (out+j)[2] = (ushort)in[i+0];
3803 (out+j)[3] = (ushort)in[i+1];
3804 (out+j)[4] = (ushort)in[i+2];
3805 (out+j)[5] = (ushort)in[i+3];
3817 ushort * restrict out = (ushort* restrict)_out;
3823 (out+j)[0] = (ushort)in[i+4];
3824 (out+j)[1] = (ushort)in[i+5];
3825 (out+j)[2] = (ushort)in[i+0];
3826 (out+j)[3] = (ushort)in[i+1];
3827 (out+j)[4] = (ushort)in[i+2];
3828 (out+j)[5] = (ushort)in[i+3];
3831 (out+j)[0] = (ushort)in[i+4];
3832 (out+j)[1] = (ushort)in[i+6];
3833 (out+j)[2] = (ushort)in[i+2];
3834 (out+j)[3] = (ushort)in[i-2];
3835 (out+j)[4] = (ushort)in[i+0];
3836 (out+j)[5] = (ushort)in[i+3];
3849 ushort * restrict out = (ushort* restrict)_out;
3853 (out+j)[0] = (ushort)in[i];
3865 ushort * restrict out = (ushort* restrict)_out;
3869 (out+j)[0] = (ushort)in[i];
3870 (out+j)[1] = (ushort)in[i+1];
3882 ushort * restrict out = (ushort* restrict)_out;
3886 (out+j)[0] = (ushort)in[i];
3887 (out+j)[1] = (ushort)in[i+1];
3899 ushort * restrict out = (ushort* restrict)_out;
3904 (out+j)[0] = (ushort)in[i];
3905 (out+j)[1] = (ushort)in[i+1];
3908 (out+j)[0] = (ushort)in[end];
3909 (out+j)[1] = (ushort)in[start];
3920 ushort * restrict out = (ushort* restrict)_out;
3924 (out+j)[0] = (ushort)in[i];
3925 (out+j)[1] = (ushort)in[i+1];
3926 (out+j)[2] = (ushort)in[i+2];
3938 ushort * restrict out = (ushort* restrict)_out;
3942 (out+j)[0] = (ushort)in[i+(i&1)];
3943 (out+j)[1] = (ushort)in[i+1-(i&1)];
3944 (out+j)[2] = (ushort)in[i+2];
3956 ushort * restrict out = (ushort* restrict)_out;
3960 (out+j)[0] = (ushort)in[start];
3961 (out+j)[1] = (ushort)in[i+1];
3962 (out+j)[2] = (ushort)in[i+2];
3974 ushort * restrict out = (ushort* restrict)_out;
3978 (out+j+0)[0] = (ushort)in[i+0];
3979 (out+j+0)[1] = (ushort)in[i+1];
3980 (out+j+0)[2] = (ushort)in[i+3];
3981 (out+j+3)[0] = (ushort)in[i+1];
3982 (out+j+3)[1] = (ushort)in[i+2];
3983 (out+j+3)[2] = (ushort)in[i+3];
3995 ushort * restrict out = (ushort* restrict)_out;
3999 (out+j+0)[0] = (ushort)in[i+2];
4000 (out+j+0)[1] = (ushort)in[i+0];
4001 (out+j+0)[2] = (ushort)in[i+3];
4002 (out+j+3)[0] = (ushort)in[i+0];
4003 (out+j+3)[1] = (ushort)in[i+1];
4004 (out+j+3)[2] = (ushort)in[i+3];
4016 ushort * restrict out = (ushort* restrict)_out;
4020 (out+j)[0] = (ushort)in[i+1];
4021 (out+j)[1] = (ushort)in[i+2];
4022 (out+j)[2] = (ushort)in[start];
4034 ushort * restrict out = (ushort* restrict)_out;
4038 (out+j)[0] = (ushort)in[i+0];
4039 (out+j)[1] = (ushort)in[i+1];
4040 (out+j)[2] = (ushort)in[i+2];
4041 (out+j)[3] = (ushort)in[i+3];
4053 ushort * restrict out = (ushort* restrict)_out;
4057 (out+j)[0] = (ushort)in[i+0];
4058 (out+j)[1] = (ushort)in[i+1];
4059 (out+j)[2] = (ushort)in[i+2];
4060 (out+j)[3] = (ushort)in[i+3];
4072 ushort * restrict out = (ushort* restrict)_out;
4076 (out+j)[0] = (ushort)in[i+0];
4077 (out+j)[1] = (ushort)in[i+1];
4078 (out+j)[2] = (ushort)in[i+2];
4079 (out+j)[3] = (ushort)in[i+3];
4080 (out+j)[4] = (ushort)in[i+4];
4081 (out+j)[5] = (ushort)in[i+5];
4093 ushort * restrict out = (ushort* restrict)_out;
4099 (out+j)[0] = (ushort)in[i+0];
4100 (out+j)[1] = (ushort)in[i+1];
4101 (out+j)[2] = (ushort)in[i+2];
4102 (out+j)[3] = (ushort)in[i+3];
4103 (out+j)[4] = (ushort)in[i+4];
4104 (out+j)[5] = (ushort)in[i+5];
4107 (out+j)[0] = (ushort)in[i+2];
4108 (out+j)[1] = (ushort)in[i-2];
4109 (out+j)[2] = (ushort)in[i+0];
4110 (out+j)[3] = (ushort)in[i+3];
4111 (out+j)[4] = (ushort)in[i+4];
4112 (out+j)[5] = (ushort)in[i+6];
4125 ushort * restrict out = (ushort* restrict)_out;
4129 (out+j)[0] = (ushort)in[i];
4141 ushort * restrict out = (ushort* restrict)_out;
4145 (out+j)[0] = (ushort)in[i];
4146 (out+j)[1] = (ushort)in[i+1];
4158 ushort * restrict out = (ushort* restrict)_out;
4162 (out+j)[0] = (ushort)in[i];
4163 (out+j)[1] = (ushort)in[i+1];
4175 ushort * restrict out = (ushort* restrict)_out;
4182 (out+j+0)[0] = restart_index;
4183 (out+j+0)[1] = restart_index;
4188 (out+j)[0] = (ushort)in[end];
4189 (out+j)[1] = (ushort)in[start];
4197 (out+j)[0] = (ushort)in[end];
4198 (out+j)[1] = (ushort)in[start];
4204 (out+j)[0] = (ushort)in[i];
4205 (out+j)[1] = (ushort)in[i+1];
4208 (out+j)[0] = (ushort)in[end];
4209 (out+j)[1] = (ushort)in[start];
4220 ushort * restrict out = (ushort* restrict)_out;
4224 (out+j)[0] = (ushort)in[i];
4225 (out+j)[1] = (ushort)in[i+1];
4226 (out+j)[2] = (ushort)in[i+2];
4238 ushort * restrict out = (ushort* restrict)_out;
4242 (out+j)[0] = (ushort)in[i+(i&1)];
4243 (out+j)[1] = (ushort)in[i+1-(i&1)];
4244 (out+j)[2] = (ushort)in[i+2];
4256 ushort * restrict out = (ushort* restrict)_out;
4262 (out+j+0)[0] = restart_index;
4263 (out+j+0)[1] = restart_index;
4264 (out+j+0)[2] = restart_index;
4282 (out+j)[0] = (ushort)in[start];
4283 (out+j)[1] = (ushort)in[i+1];
4284 (out+j)[2] = (ushort)in[i+2];
4296 ushort * restrict out = (ushort* restrict)_out;
4302 (out+j+0)[0] = restart_index;
4303 (out+j+0)[1] = restart_index;
4304 (out+j+0)[2] = restart_index;
4305 (out+j+3)[0] = restart_index;
4306 (out+j+3)[1] = restart_index;
4307 (out+j+3)[2] = restart_index;
4326 (out+j+0)[0] = (ushort)in[i+0];
4327 (out+j+0)[1] = (ushort)in[i+1];
4328 (out+j+0)[2] = (ushort)in[i+3];
4329 (out+j+3)[0] = (ushort)in[i+1];
4330 (out+j+3)[1] = (ushort)in[i+2];
4331 (out+j+3)[2] = (ushort)in[i+3];
4343 ushort * restrict out = (ushort* restrict)_out;
4349 (out+j+0)[0] = restart_index;
4350 (out+j+0)[1] = restart_index;
4351 (out+j+0)[2] = restart_index;
4352 (out+j+3)[0] = restart_index;
4353 (out+j+3)[1] = restart_index;
4354 (out+j+3)[2] = restart_index;
4373 (out+j+0)[0] = (ushort)in[i+2];
4374 (out+j+0)[1] = (ushort)in[i+0];
4375 (out+j+0)[2] = (ushort)in[i+3];
4376 (out+j+3)[0] = (ushort)in[i+0];
4377 (out+j+3)[1] = (ushort)in[i+1];
4378 (out+j+3)[2] = (ushort)in[i+3];
4390 ushort * restrict out = (ushort* restrict)_out;
4396 (out+j+0)[0] = restart_index;
4397 (out+j+0)[1] = restart_index;
4398 (out+j+0)[2] = restart_index;
4416 (out+j)[0] = (ushort)in[i+1];
4417 (out+j)[1] = (ushort)in[i+2];
4418 (out+j)[2] = (ushort)in[start];
4430 ushort * restrict out = (ushort* restrict)_out;
4434 (out+j)[0] = (ushort)in[i+0];
4435 (out+j)[1] = (ushort)in[i+1];
4436 (out+j)[2] = (ushort)in[i+2];
4437 (out+j)[3] = (ushort)in[i+3];
4449 ushort * restrict out = (ushort* restrict)_out;
4453 (out+j)[0] = (ushort)in[i+0];
4454 (out+j)[1] = (ushort)in[i+1];
4455 (out+j)[2] = (ushort)in[i+2];
4456 (out+j)[3] = (ushort)in[i+3];
4468 ushort * restrict out = (ushort* restrict)_out;
4472 (out+j)[0] = (ushort)in[i+0];
4473 (out+j)[1] = (ushort)in[i+1];
4474 (out+j)[2] = (ushort)in[i+2];
4475 (out+j)[3] = (ushort)in[i+3];
4476 (out+j)[4] = (ushort)in[i+4];
4477 (out+j)[5] = (ushort)in[i+5];
4489 ushort * restrict out = (ushort* restrict)_out;
4495 (out+j)[0] = (ushort)in[i+0];
4496 (out+j)[1] = (ushort)in[i+1];
4497 (out+j)[2] = (ushort)in[i+2];
4498 (out+j)[3] = (ushort)in[i+3];
4499 (out+j)[4] = (ushort)in[i+4];
4500 (out+j)[5] = (ushort)in[i+5];
4503 (out+j)[0] = (ushort)in[i+2];
4504 (out+j)[1] = (ushort)in[i-2];
4505 (out+j)[2] = (ushort)in[i+0];
4506 (out+j)[3] = (ushort)in[i+3];
4507 (out+j)[4] = (ushort)in[i+4];
4508 (out+j)[5] = (ushort)in[i+6];
4521 uint * restrict out = (uint* restrict)_out;
4525 (out+j)[0] = (uint)in[i];
4537 uint * restrict out = (uint* restrict)_out;
4541 (out+j)[0] = (uint)in[i];
4542 (out+j)[1] = (uint)in[i+1];
4554 uint * restrict out = (uint* restrict)_out;
4558 (out+j)[0] = (uint)in[i];
4559 (out+j)[1] = (uint)in[i+1];
4571 uint * restrict out = (uint* restrict)_out;
4576 (out+j)[0] = (uint)in[i];
4577 (out+j)[1] = (uint)in[i+1];
4580 (out+j)[0] = (uint)in[end];
4581 (out+j)[1] = (uint)in[start];
4592 uint * restrict out = (uint* restrict)_out;
4596 (out+j)[0] = (uint)in[i];
4597 (out+j)[1] = (uint)in[i+1];
4598 (out+j)[2] = (uint)in[i+2];
4610 uint * restrict out = (uint* restrict)_out;
4614 (out+j)[0] = (uint)in[i];
4615 (out+j)[1] = (uint)in[i+1+(i&1)];
4616 (out+j)[2] = (uint)in[i+2-(i&1)];
4628 uint * restrict out = (uint* restrict)_out;
4632 (out+j)[0] = (uint)in[i+1];
4633 (out+j)[1] = (uint)in[i+2];
4634 (out+j)[2] = (uint)in[start];
4646 uint * restrict out = (uint* restrict)_out;
4650 (out+j+0)[0] = (uint)in[i+0];
4651 (out+j+0)[1] = (uint)in[i+1];
4652 (out+j+0)[2] = (uint)in[i+2];
4653 (out+j+3)[0] = (uint)in[i+0];
4654 (out+j+3)[1] = (uint)in[i+2];
4655 (out+j+3)[2] = (uint)in[i+3];
4667 uint * restrict out = (uint* restrict)_out;
4671 (out+j+0)[0] = (uint)in[i+0];
4672 (out+j+0)[1] = (uint)in[i+1];
4673 (out+j+0)[2] = (uint)in[i+3];
4674 (out+j+3)[0] = (uint)in[i+0];
4675 (out+j+3)[1] = (uint)in[i+3];
4676 (out+j+3)[2] = (uint)in[i+2];
4688 uint * restrict out = (uint* restrict)_out;
4692 (out+j)[0] = (uint)in[start];
4693 (out+j)[1] = (uint)in[i+1];
4694 (out+j)[2] = (uint)in[i+2];
4706 uint * restrict out = (uint* restrict)_out;
4710 (out+j)[0] = (uint)in[i+0];
4711 (out+j)[1] = (uint)in[i+1];
4712 (out+j)[2] = (uint)in[i+2];
4713 (out+j)[3] = (uint)in[i+3];
4725 uint * restrict out = (uint* restrict)_out;
4729 (out+j)[0] = (uint)in[i+0];
4730 (out+j)[1] = (uint)in[i+1];
4731 (out+j)[2] = (uint)in[i+2];
4732 (out+j)[3] = (uint)in[i+3];
4744 uint * restrict out = (uint* restrict)_out;
4748 (out+j)[0] = (uint)in[i+0];
4749 (out+j)[1] = (uint)in[i+1];
4750 (out+j)[2] = (uint)in[i+2];
4751 (out+j)[3] = (uint)in[i+3];
4752 (out+j)[4] = (uint)in[i+4];
4753 (out+j)[5] = (uint)in[i+5];
4765 uint * restrict out = (uint* restrict)_out;
4771 (out+j)[0] = (uint)in[i+0];
4772 (out+j)[1] = (uint)in[i+1];
4773 (out+j)[2] = (uint)in[i+2];
4774 (out+j)[3] = (uint)in[i+3];
4775 (out+j)[4] = (uint)in[i+4];
4776 (out+j)[5] = (uint)in[i+5];
4779 (out+j)[0] = (uint)in[i+2];
4780 (out+j)[1] = (uint)in[i-2];
4781 (out+j)[2] = (uint)in[i+0];
4782 (out+j)[3] = (uint)in[i+3];
4783 (out+j)[4] = (uint)in[i+4];
4784 (out+j)[5] = (uint)in[i+6];
4797 uint * restrict out = (uint* restrict)_out;
4801 (out+j)[0] = (uint)in[i];
4813 uint * restrict out = (uint* restrict)_out;
4817 (out+j)[0] = (uint)in[i];
4818 (out+j)[1] = (uint)in[i+1];
4830 uint * restrict out = (uint* restrict)_out;
4834 (out+j)[0] = (uint)in[i];
4835 (out+j)[1] = (uint)in[i+1];
4847 uint * restrict out = (uint* restrict)_out;
4854 (out+j+0)[0] = restart_index;
4855 (out+j+0)[1] = restart_index;
4860 (out+j)[0] = (uint)in[end];
4861 (out+j)[1] = (uint)in[start];
4869 (out+j)[0] = (uint)in[end];
4870 (out+j)[1] = (uint)in[start];
4876 (out+j)[0] = (uint)in[i];
4877 (out+j)[1] = (uint)in[i+1];
4880 (out+j)[0] = (uint)in[end];
4881 (out+j)[1] = (uint)in[start];
4892 uint * restrict out = (uint* restrict)_out;
4896 (out+j)[0] = (uint)in[i];
4897 (out+j)[1] = (uint)in[i+1];
4898 (out+j)[2] = (uint)in[i+2];
4910 uint * restrict out = (uint* restrict)_out;
4914 (out+j)[0] = (uint)in[i];
4915 (out+j)[1] = (uint)in[i+1+(i&1)];
4916 (out+j)[2] = (uint)in[i+2-(i&1)];
4928 uint * restrict out = (uint* restrict)_out;
4934 (out+j+0)[0] = restart_index;
4935 (out+j+0)[1] = restart_index;
4936 (out+j+0)[2] = restart_index;
4954 (out+j)[0] = (uint)in[i+1];
4955 (out+j)[1] = (uint)in[i+2];
4956 (out+j)[2] = (uint)in[start];
4968 uint * restrict out = (uint* restrict)_out;
4974 (out+j+0)[0] = restart_index;
4975 (out+j+0)[1] = restart_index;
4976 (out+j+0)[2] = restart_index;
4977 (out+j+3)[0] = restart_index;
4978 (out+j+3)[1] = restart_index;
4979 (out+j+3)[2] = restart_index;
4998 (out+j+0)[0] = (uint)in[i+0];
4999 (out+j+0)[1] = (uint)in[i+1];
5000 (out+j+0)[2] = (uint)in[i+2];
5001 (out+j+3)[0] = (uint)in[i+0];
5002 (out+j+3)[1] = (uint)in[i+2];
5003 (out+j+3)[2] = (uint)in[i+3];
5015 uint * restrict out = (uint* restrict)_out;
5021 (out+j+0)[0] = restart_index;
5022 (out+j+0)[1] = restart_index;
5023 (out+j+0)[2] = restart_index;
5024 (out+j+3)[0] = restart_index;
5025 (out+j+3)[1] = restart_index;
5026 (out+j+3)[2] = restart_index;
5045 (out+j+0)[0] = (uint)in[i+0];
5046 (out+j+0)[1] = (uint)in[i+1];
5047 (out+j+0)[2] = (uint)in[i+3];
5048 (out+j+3)[0] = (uint)in[i+0];
5049 (out+j+3)[1] = (uint)in[i+3];
5050 (out+j+3)[2] = (uint)in[i+2];
5062 uint * restrict out = (uint* restrict)_out;
5068 (out+j+0)[0] = restart_index;
5069 (out+j+0)[1] = restart_index;
5070 (out+j+0)[2] = restart_index;
5088 (out+j)[0] = (uint)in[start];
5089 (out+j)[1] = (uint)in[i+1];
5090 (out+j)[2] = (uint)in[i+2];
5102 uint * restrict out = (uint* restrict)_out;
5106 (out+j)[0] = (uint)in[i+0];
5107 (out+j)[1] = (uint)in[i+1];
5108 (out+j)[2] = (uint)in[i+2];
5109 (out+j)[3] = (uint)in[i+3];
5121 uint * restrict out = (uint* restrict)_out;
5125 (out+j)[0] = (uint)in[i+0];
5126 (out+j)[1] = (uint)in[i+1];
5127 (out+j)[2] = (uint)in[i+2];
5128 (out+j)[3] = (uint)in[i+3];
5140 uint * restrict out = (uint* restrict)_out;
5144 (out+j)[0] = (uint)in[i+0];
5145 (out+j)[1] = (uint)in[i+1];
5146 (out+j)[2] = (uint)in[i+2];
5147 (out+j)[3] = (uint)in[i+3];
5148 (out+j)[4] = (uint)in[i+4];
5149 (out+j)[5] = (uint)in[i+5];
5161 uint * restrict out = (uint* restrict)_out;
5167 (out+j)[0] = (uint)in[i+0];
5168 (out+j)[1] = (uint)in[i+1];
5169 (out+j)[2] = (uint)in[i+2];
5170 (out+j)[3] = (uint)in[i+3];
5171 (out+j)[4] = (uint)in[i+4];
5172 (out+j)[5] = (uint)in[i+5];
5175 (out+j)[0] = (uint)in[i+2];
5176 (out+j)[1] = (uint)in[i-2];
5177 (out+j)[2] = (uint)in[i+0];
5178 (out+j)[3] = (uint)in[i+3];
5179 (out+j)[4] = (uint)in[i+4];
5180 (out+j)[5] = (uint)in[i+6];
5193 uint * restrict out = (uint* restrict)_out;
5197 (out+j)[0] = (uint)in[i];
5209 uint * restrict out = (uint* restrict)_out;
5213 (out+j)[0] = (uint)in[i+1];
5214 (out+j)[1] = (uint)in[i];
5226 uint * restrict out = (uint* restrict)_out;
5230 (out+j)[0] = (uint)in[i+1];
5231 (out+j)[1] = (uint)in[i];
5243 uint * restrict out = (uint* restrict)_out;
5248 (out+j)[0] = (uint)in[i+1];
5249 (out+j)[1] = (uint)in[i];
5252 (out+j)[0] = (uint)in[start];
5253 (out+j)[1] = (uint)in[end];
5264 uint * restrict out = (uint* restrict)_out;
5268 (out+j)[0] = (uint)in[i+1];
5269 (out+j)[1] = (uint)in[i+2];
5270 (out+j)[2] = (uint)in[i];
5282 uint * restrict out = (uint* restrict)_out;
5286 (out+j)[0] = (uint)in[i+1+(i&1)];
5287 (out+j)[1] = (uint)in[i+2-(i&1)];
5288 (out+j)[2] = (uint)in[i];
5300 uint * restrict out = (uint* restrict)_out;
5304 (out+j)[0] = (uint)in[i+2];
5305 (out+j)[1] = (uint)in[start];
5306 (out+j)[2] = (uint)in[i+1];
5318 uint * restrict out = (uint* restrict)_out;
5322 (out+j+0)[0] = (uint)in[i+1];
5323 (out+j+0)[1] = (uint)in[i+2];
5324 (out+j+0)[2] = (uint)in[i+0];
5325 (out+j+3)[0] = (uint)in[i+2];
5326 (out+j+3)[1] = (uint)in[i+3];
5327 (out+j+3)[2] = (uint)in[i+0];
5339 uint * restrict out = (uint* restrict)_out;
5343 (out+j+0)[0] = (uint)in[i+1];
5344 (out+j+0)[1] = (uint)in[i+3];
5345 (out+j+0)[2] = (uint)in[i+0];
5346 (out+j+3)[0] = (uint)in[i+3];
5347 (out+j+3)[1] = (uint)in[i+2];
5348 (out+j+3)[2] = (uint)in[i+0];
5360 uint * restrict out = (uint* restrict)_out;
5364 (out+j)[0] = (uint)in[i+1];
5365 (out+j)[1] = (uint)in[i+2];
5366 (out+j)[2] = (uint)in[start];
5378 uint * restrict out = (uint* restrict)_out;
5382 (out+j)[0] = (uint)in[i+3];
5383 (out+j)[1] = (uint)in[i+2];
5384 (out+j)[2] = (uint)in[i+1];
5385 (out+j)[3] = (uint)in[i+0];
5397 uint * restrict out = (uint* restrict)_out;
5401 (out+j)[0] = (uint)in[i+3];
5402 (out+j)[1] = (uint)in[i+2];
5403 (out+j)[2] = (uint)in[i+1];
5404 (out+j)[3] = (uint)in[i+0];
5416 uint * restrict out = (uint* restrict)_out;
5420 (out+j)[0] = (uint)in[i+4];
5421 (out+j)[1] = (uint)in[i+5];
5422 (out+j)[2] = (uint)in[i+0];
5423 (out+j)[3] = (uint)in[i+1];
5424 (out+j)[4] = (uint)in[i+2];
5425 (out+j)[5] = (uint)in[i+3];
5437 uint * restrict out = (uint* restrict)_out;
5443 (out+j)[0] = (uint)in[i+4];
5444 (out+j)[1] = (uint)in[i+5];
5445 (out+j)[2] = (uint)in[i+0];
5446 (out+j)[3] = (uint)in[i+1];
5447 (out+j)[4] = (uint)in[i+2];
5448 (out+j)[5] = (uint)in[i+3];
5451 (out+j)[0] = (uint)in[i+4];
5452 (out+j)[1] = (uint)in[i+6];
5453 (out+j)[2] = (uint)in[i+2];
5454 (out+j)[3] = (uint)in[i-2];
5455 (out+j)[4] = (uint)in[i+0];
5456 (out+j)[5] = (uint)in[i+3];
5469 uint * restrict out = (uint* restrict)_out;
5473 (out+j)[0] = (uint)in[i];
5485 uint * restrict out = (uint* restrict)_out;
5489 (out+j)[0] = (uint)in[i+1];
5490 (out+j)[1] = (uint)in[i];
5502 uint * restrict out = (uint* restrict)_out;
5506 (out+j)[0] = (uint)in[i+1];
5507 (out+j)[1] = (uint)in[i];
5519 uint * restrict out = (uint* restrict)_out;
5526 (out+j+0)[0] = restart_index;
5527 (out+j+0)[1] = restart_index;
5532 (out+j)[0] = (uint)in[start];
5533 (out+j)[1] = (uint)in[end];
5541 (out+j)[0] = (uint)in[start];
5542 (out+j)[1] = (uint)in[end];
5548 (out+j)[0] = (uint)in[i+1];
5549 (out+j)[1] = (uint)in[i];
5552 (out+j)[0] = (uint)in[start];
5553 (out+j)[1] = (uint)in[end];
5564 uint * restrict out = (uint* restrict)_out;
5568 (out+j)[0] = (uint)in[i+1];
5569 (out+j)[1] = (uint)in[i+2];
5570 (out+j)[2] = (uint)in[i];
5582 uint * restrict out = (uint* restrict)_out;
5586 (out+j)[0] = (uint)in[i+1+(i&1)];
5587 (out+j)[1] = (uint)in[i+2-(i&1)];
5588 (out+j)[2] = (uint)in[i];
5600 uint * restrict out = (uint* restrict)_out;
5606 (out+j+0)[0] = restart_index;
5607 (out+j+0)[1] = restart_index;
5608 (out+j+0)[2] = restart_index;
5626 (out+j)[0] = (uint)in[i+2];
5627 (out+j)[1] = (uint)in[start];
5628 (out+j)[2] = (uint)in[i+1];
5640 uint * restrict out = (uint* restrict)_out;
5646 (out+j+0)[0] = restart_index;
5647 (out+j+0)[1] = restart_index;
5648 (out+j+0)[2] = restart_index;
5649 (out+j+3)[0] = restart_index;
5650 (out+j+3)[1] = restart_index;
5651 (out+j+3)[2] = restart_index;
5670 (out+j+0)[0] = (uint)in[i+1];
5671 (out+j+0)[1] = (uint)in[i+2];
5672 (out+j+0)[2] = (uint)in[i+0];
5673 (out+j+3)[0] = (uint)in[i+2];
5674 (out+j+3)[1] = (uint)in[i+3];
5675 (out+j+3)[2] = (uint)in[i+0];
5687 uint * restrict out = (uint* restrict)_out;
5693 (out+j+0)[0] = restart_index;
5694 (out+j+0)[1] = restart_index;
5695 (out+j+0)[2] = restart_index;
5696 (out+j+3)[0] = restart_index;
5697 (out+j+3)[1] = restart_index;
5698 (out+j+3)[2] = restart_index;
5717 (out+j+0)[0] = (uint)in[i+1];
5718 (out+j+0)[1] = (uint)in[i+3];
5719 (out+j+0)[2] = (uint)in[i+0];
5720 (out+j+3)[0] = (uint)in[i+3];
5721 (out+j+3)[1] = (uint)in[i+2];
5722 (out+j+3)[2] = (uint)in[i+0];
5734 uint * restrict out = (uint* restrict)_out;
5740 (out+j+0)[0] = restart_index;
5741 (out+j+0)[1] = restart_index;
5742 (out+j+0)[2] = restart_index;
5760 (out+j)[0] = (uint)in[i+1];
5761 (out+j)[1] = (uint)in[i+2];
5762 (out+j)[2] = (uint)in[start];
5774 uint * restrict out = (uint* restrict)_out;
5778 (out+j)[0] = (uint)in[i+3];
5779 (out+j)[1] = (uint)in[i+2];
5780 (out+j)[2] = (uint)in[i+1];
5781 (out+j)[3] = (uint)in[i+0];
5793 uint * restrict out = (uint* restrict)_out;
5797 (out+j)[0] = (uint)in[i+3];
5798 (out+j)[1] = (uint)in[i+2];
5799 (out+j)[2] = (uint)in[i+1];
5800 (out+j)[3] = (uint)in[i+0];
5812 uint * restrict out = (uint* restrict)_out;
5816 (out+j)[0] = (uint)in[i+4];
5817 (out+j)[1] = (uint)in[i+5];
5818 (out+j)[2] = (uint)in[i+0];
5819 (out+j)[3] = (uint)in[i+1];
5820 (out+j)[4] = (uint)in[i+2];
5821 (out+j)[5] = (uint)in[i+3];
5833 uint * restrict out = (uint* restrict)_out;
5839 (out+j)[0] = (uint)in[i+4];
5840 (out+j)[1] = (uint)in[i+5];
5841 (out+j)[2] = (uint)in[i+0];
5842 (out+j)[3] = (uint)in[i+1];
5843 (out+j)[4] = (uint)in[i+2];
5844 (out+j)[5] = (uint)in[i+3];
5847 (out+j)[0] = (uint)in[i+4];
5848 (out+j)[1] = (uint)in[i+6];
5849 (out+j)[2] = (uint)in[i+2];
5850 (out+j)[3] = (uint)in[i-2];
5851 (out+j)[4] = (uint)in[i+0];
5852 (out+j)[5] = (uint)in[i+3];
5865 uint * restrict out = (uint* restrict)_out;
5869 (out+j)[0] = (uint)in[i];
5881 uint * restrict out = (uint* restrict)_out;
5885 (out+j)[0] = (uint)in[i+1];
5886 (out+j)[1] = (uint)in[i];
5898 uint * restrict out = (uint* restrict)_out;
5902 (out+j)[0] = (uint)in[i+1];
5903 (out+j)[1] = (uint)in[i];
5915 uint * restrict out = (uint* restrict)_out;
5920 (out+j)[0] = (uint)in[i+1];
5921 (out+j)[1] = (uint)in[i];
5924 (out+j)[0] = (uint)in[start];
5925 (out+j)[1] = (uint)in[end];
5936 uint * restrict out = (uint* restrict)_out;
5940 (out+j)[0] = (uint)in[i+2];
5941 (out+j)[1] = (uint)in[i];
5942 (out+j)[2] = (uint)in[i+1];
5954 uint * restrict out = (uint* restrict)_out;
5958 (out+j)[0] = (uint)in[i+2];
5959 (out+j)[1] = (uint)in[i+(i&1)];
5960 (out+j)[2] = (uint)in[i+1-(i&1)];
5972 uint * restrict out = (uint* restrict)_out;
5976 (out+j)[0] = (uint)in[i+2];
5977 (out+j)[1] = (uint)in[start];
5978 (out+j)[2] = (uint)in[i+1];
5990 uint * restrict out = (uint* restrict)_out;
5994 (out+j+0)[0] = (uint)in[i+3];
5995 (out+j+0)[1] = (uint)in[i+0];
5996 (out+j+0)[2] = (uint)in[i+1];
5997 (out+j+3)[0] = (uint)in[i+3];
5998 (out+j+3)[1] = (uint)in[i+1];
5999 (out+j+3)[2] = (uint)in[i+2];
6011 uint * restrict out = (uint* restrict)_out;
6015 (out+j+0)[0] = (uint)in[i+3];
6016 (out+j+0)[1] = (uint)in[i+2];
6017 (out+j+0)[2] = (uint)in[i+0];
6018 (out+j+3)[0] = (uint)in[i+3];
6019 (out+j+3)[1] = (uint)in[i+0];
6020 (out+j+3)[2] = (uint)in[i+1];
6032 uint * restrict out = (uint* restrict)_out;
6036 (out+j)[0] = (uint)in[start];
6037 (out+j)[1] = (uint)in[i+1];
6038 (out+j)[2] = (uint)in[i+2];
6050 uint * restrict out = (uint* restrict)_out;
6054 (out+j)[0] = (uint)in[i+3];
6055 (out+j)[1] = (uint)in[i+2];
6056 (out+j)[2] = (uint)in[i+1];
6057 (out+j)[3] = (uint)in[i+0];
6069 uint * restrict out = (uint* restrict)_out;
6073 (out+j)[0] = (uint)in[i+3];
6074 (out+j)[1] = (uint)in[i+2];
6075 (out+j)[2] = (uint)in[i+1];
6076 (out+j)[3] = (uint)in[i+0];
6088 uint * restrict out = (uint* restrict)_out;
6092 (out+j)[0] = (uint)in[i+4];
6093 (out+j)[1] = (uint)in[i+5];
6094 (out+j)[2] = (uint)in[i+0];
6095 (out+j)[3] = (uint)in[i+1];
6096 (out+j)[4] = (uint)in[i+2];
6097 (out+j)[5] = (uint)in[i+3];
6109 uint * restrict out = (uint* restrict)_out;
6115 (out+j)[0] = (uint)in[i+4];
6116 (out+j)[1] = (uint)in[i+5];
6117 (out+j)[2] = (uint)in[i+0];
6118 (out+j)[3] = (uint)in[i+1];
6119 (out+j)[4] = (uint)in[i+2];
6120 (out+j)[5] = (uint)in[i+3];
6123 (out+j)[0] = (uint)in[i+4];
6124 (out+j)[1] = (uint)in[i+6];
6125 (out+j)[2] = (uint)in[i+2];
6126 (out+j)[3] = (uint)in[i-2];
6127 (out+j)[4] = (uint)in[i+0];
6128 (out+j)[5] = (uint)in[i+3];
6141 uint * restrict out = (uint* restrict)_out;
6145 (out+j)[0] = (uint)in[i];
6157 uint * restrict out = (uint* restrict)_out;
6161 (out+j)[0] = (uint)in[i+1];
6162 (out+j)[1] = (uint)in[i];
6174 uint * restrict out = (uint* restrict)_out;
6178 (out+j)[0] = (uint)in[i+1];
6179 (out+j)[1] = (uint)in[i];
6191 uint * restrict out = (uint* restrict)_out;
6198 (out+j+0)[0] = restart_index;
6199 (out+j+0)[1] = restart_index;
6204 (out+j)[0] = (uint)in[start];
6205 (out+j)[1] = (uint)in[end];
6213 (out+j)[0] = (uint)in[start];
6214 (out+j)[1] = (uint)in[end];
6220 (out+j)[0] = (uint)in[i+1];
6221 (out+j)[1] = (uint)in[i];
6224 (out+j)[0] = (uint)in[start];
6225 (out+j)[1] = (uint)in[end];
6236 uint * restrict out = (uint* restrict)_out;
6240 (out+j)[0] = (uint)in[i+2];
6241 (out+j)[1] = (uint)in[i];
6242 (out+j)[2] = (uint)in[i+1];
6254 uint * restrict out = (uint* restrict)_out;
6258 (out+j)[0] = (uint)in[i+2];
6259 (out+j)[1] = (uint)in[i+(i&1)];
6260 (out+j)[2] = (uint)in[i+1-(i&1)];
6272 uint * restrict out = (uint* restrict)_out;
6278 (out+j+0)[0] = restart_index;
6279 (out+j+0)[1] = restart_index;
6280 (out+j+0)[2] = restart_index;
6298 (out+j)[0] = (uint)in[i+2];
6299 (out+j)[1] = (uint)in[start];
6300 (out+j)[2] = (uint)in[i+1];
6312 uint * restrict out = (uint* restrict)_out;
6318 (out+j+0)[0] = restart_index;
6319 (out+j+0)[1] = restart_index;
6320 (out+j+0)[2] = restart_index;
6321 (out+j+3)[0] = restart_index;
6322 (out+j+3)[1] = restart_index;
6323 (out+j+3)[2] = restart_index;
6342 (out+j+0)[0] = (uint)in[i+3];
6343 (out+j+0)[1] = (uint)in[i+0];
6344 (out+j+0)[2] = (uint)in[i+1];
6345 (out+j+3)[0] = (uint)in[i+3];
6346 (out+j+3)[1] = (uint)in[i+1];
6347 (out+j+3)[2] = (uint)in[i+2];
6359 uint * restrict out = (uint* restrict)_out;
6365 (out+j+0)[0] = restart_index;
6366 (out+j+0)[1] = restart_index;
6367 (out+j+0)[2] = restart_index;
6368 (out+j+3)[0] = restart_index;
6369 (out+j+3)[1] = restart_index;
6370 (out+j+3)[2] = restart_index;
6389 (out+j+0)[0] = (uint)in[i+3];
6390 (out+j+0)[1] = (uint)in[i+2];
6391 (out+j+0)[2] = (uint)in[i+0];
6392 (out+j+3)[0] = (uint)in[i+3];
6393 (out+j+3)[1] = (uint)in[i+0];
6394 (out+j+3)[2] = (uint)in[i+1];
6406 uint * restrict out = (uint* restrict)_out;
6412 (out+j+0)[0] = restart_index;
6413 (out+j+0)[1] = restart_index;
6414 (out+j+0)[2] = restart_index;
6432 (out+j)[0] = (uint)in[start];
6433 (out+j)[1] = (uint)in[i+1];
6434 (out+j)[2] = (uint)in[i+2];
6446 uint * restrict out = (uint* restrict)_out;
6450 (out+j)[0] = (uint)in[i+3];
6451 (out+j)[1] = (uint)in[i+2];
6452 (out+j)[2] = (uint)in[i+1];
6453 (out+j)[3] = (uint)in[i+0];
6465 uint * restrict out = (uint* restrict)_out;
6469 (out+j)[0] = (uint)in[i+3];
6470 (out+j)[1] = (uint)in[i+2];
6471 (out+j)[2] = (uint)in[i+1];
6472 (out+j)[3] = (uint)in[i+0];
6484 uint * restrict out = (uint* restrict)_out;
6488 (out+j)[0] = (uint)in[i+4];
6489 (out+j)[1] = (uint)in[i+5];
6490 (out+j)[2] = (uint)in[i+0];
6491 (out+j)[3] = (uint)in[i+1];
6492 (out+j)[4] = (uint)in[i+2];
6493 (out+j)[5] = (uint)in[i+3];
6505 uint * restrict out = (uint* restrict)_out;
6511 (out+j)[0] = (uint)in[i+4];
6512 (out+j)[1] = (uint)in[i+5];
6513 (out+j)[2] = (uint)in[i+0];
6514 (out+j)[3] = (uint)in[i+1];
6515 (out+j)[4] = (uint)in[i+2];
6516 (out+j)[5] = (uint)in[i+3];
6519 (out+j)[0] = (uint)in[i+4];
6520 (out+j)[1] = (uint)in[i+6];
6521 (out+j)[2] = (uint)in[i+2];
6522 (out+j)[3] = (uint)in[i-2];
6523 (out+j)[4] = (uint)in[i+0];
6524 (out+j)[5] = (uint)in[i+3];
6537 uint * restrict out = (uint* restrict)_out;
6541 (out+j)[0] = (uint)in[i];
6553 uint * restrict out = (uint* restrict)_out;
6557 (out+j)[0] = (uint)in[i];
6558 (out+j)[1] = (uint)in[i+1];
6570 uint * restrict out = (uint* restrict)_out;
6574 (out+j)[0] = (uint)in[i];
6575 (out+j)[1] = (uint)in[i+1];
6587 uint * restrict out = (uint* restrict)_out;
6592 (out+j)[0] = (uint)in[i];
6593 (out+j)[1] = (uint)in[i+1];
6596 (out+j)[0] = (uint)in[end];
6597 (out+j)[1] = (uint)in[start];
6608 uint * restrict out = (uint* restrict)_out;
6612 (out+j)[0] = (uint)in[i];
6613 (out+j)[1] = (uint)in[i+1];
6614 (out+j)[2] = (uint)in[i+2];
6626 uint * restrict out = (uint* restrict)_out;
6630 (out+j)[0] = (uint)in[i+(i&1)];
6631 (out+j)[1] = (uint)in[i+1-(i&1)];
6632 (out+j)[2] = (uint)in[i+2];
6644 uint * restrict out = (uint* restrict)_out;
6648 (out+j)[0] = (uint)in[start];
6649 (out+j)[1] = (uint)in[i+1];
6650 (out+j)[2] = (uint)in[i+2];
6662 uint * restrict out = (uint* restrict)_out;
6666 (out+j+0)[0] = (uint)in[i+0];
6667 (out+j+0)[1] = (uint)in[i+1];
6668 (out+j+0)[2] = (uint)in[i+3];
6669 (out+j+3)[0] = (uint)in[i+1];
6670 (out+j+3)[1] = (uint)in[i+2];
6671 (out+j+3)[2] = (uint)in[i+3];
6683 uint * restrict out = (uint* restrict)_out;
6687 (out+j+0)[0] = (uint)in[i+2];
6688 (out+j+0)[1] = (uint)in[i+0];
6689 (out+j+0)[2] = (uint)in[i+3];
6690 (out+j+3)[0] = (uint)in[i+0];
6691 (out+j+3)[1] = (uint)in[i+1];
6692 (out+j+3)[2] = (uint)in[i+3];
6704 uint * restrict out = (uint* restrict)_out;
6708 (out+j)[0] = (uint)in[i+1];
6709 (out+j)[1] = (uint)in[i+2];
6710 (out+j)[2] = (uint)in[start];
6722 uint * restrict out = (uint* restrict)_out;
6726 (out+j)[0] = (uint)in[i+0];
6727 (out+j)[1] = (uint)in[i+1];
6728 (out+j)[2] = (uint)in[i+2];
6729 (out+j)[3] = (uint)in[i+3];
6741 uint * restrict out = (uint* restrict)_out;
6745 (out+j)[0] = (uint)in[i+0];
6746 (out+j)[1] = (uint)in[i+1];
6747 (out+j)[2] = (uint)in[i+2];
6748 (out+j)[3] = (uint)in[i+3];
6760 uint * restrict out = (uint* restrict)_out;
6764 (out+j)[0] = (uint)in[i+0];
6765 (out+j)[1] = (uint)in[i+1];
6766 (out+j)[2] = (uint)in[i+2];
6767 (out+j)[3] = (uint)in[i+3];
6768 (out+j)[4] = (uint)in[i+4];
6769 (out+j)[5] = (uint)in[i+5];
6781 uint * restrict out = (uint* restrict)_out;
6787 (out+j)[0] = (uint)in[i+0];
6788 (out+j)[1] = (uint)in[i+1];
6789 (out+j)[2] = (uint)in[i+2];
6790 (out+j)[3] = (uint)in[i+3];
6791 (out+j)[4] = (uint)in[i+4];
6792 (out+j)[5] = (uint)in[i+5];
6795 (out+j)[0] = (uint)in[i+2];
6796 (out+j)[1] = (uint)in[i-2];
6797 (out+j)[2] = (uint)in[i+0];
6798 (out+j)[3] = (uint)in[i+3];
6799 (out+j)[4] = (uint)in[i+4];
6800 (out+j)[5] = (uint)in[i+6];
6813 uint * restrict out = (uint* restrict)_out;
6817 (out+j)[0] = (uint)in[i];
6829 uint * restrict out = (uint* restrict)_out;
6833 (out+j)[0] = (uint)in[i];
6834 (out+j)[1] = (uint)in[i+1];
6846 uint * restrict out = (uint* restrict)_out;
6850 (out+j)[0] = (uint)in[i];
6851 (out+j)[1] = (uint)in[i+1];
6863 uint * restrict out = (uint* restrict)_out;
6870 (out+j+0)[0] = restart_index;
6871 (out+j+0)[1] = restart_index;
6876 (out+j)[0] = (uint)in[end];
6877 (out+j)[1] = (uint)in[start];
6885 (out+j)[0] = (uint)in[end];
6886 (out+j)[1] = (uint)in[start];
6892 (out+j)[0] = (uint)in[i];
6893 (out+j)[1] = (uint)in[i+1];
6896 (out+j)[0] = (uint)in[end];
6897 (out+j)[1] = (uint)in[start];
6908 uint * restrict out = (uint* restrict)_out;
6912 (out+j)[0] = (uint)in[i];
6913 (out+j)[1] = (uint)in[i+1];
6914 (out+j)[2] = (uint)in[i+2];
6926 uint * restrict out = (uint* restrict)_out;
6930 (out+j)[0] = (uint)in[i+(i&1)];
6931 (out+j)[1] = (uint)in[i+1-(i&1)];
6932 (out+j)[2] = (uint)in[i+2];
6944 uint * restrict out = (uint* restrict)_out;
6950 (out+j+0)[0] = restart_index;
6951 (out+j+0)[1] = restart_index;
6952 (out+j+0)[2] = restart_index;
6970 (out+j)[0] = (uint)in[start];
6971 (out+j)[1] = (uint)in[i+1];
6972 (out+j)[2] = (uint)in[i+2];
6984 uint * restrict out = (uint* restrict)_out;
6990 (out+j+0)[0] = restart_index;
6991 (out+j+0)[1] = restart_index;
6992 (out+j+0)[2] = restart_index;
6993 (out+j+3)[0] = restart_index;
6994 (out+j+3)[1] = restart_index;
6995 (out+j+3)[2] = restart_index;
7014 (out+j+0)[0] = (uint)in[i+0];
7015 (out+j+0)[1] = (uint)in[i+1];
7016 (out+j+0)[2] = (uint)in[i+3];
7017 (out+j+3)[0] = (uint)in[i+1];
7018 (out+j+3)[1] = (uint)in[i+2];
7019 (out+j+3)[2] = (uint)in[i+3];
7031 uint * restrict out = (uint* restrict)_out;
7037 (out+j+0)[0] = restart_index;
7038 (out+j+0)[1] = restart_index;
7039 (out+j+0)[2] = restart_index;
7040 (out+j+3)[0] = restart_index;
7041 (out+j+3)[1] = restart_index;
7042 (out+j+3)[2] = restart_index;
7061 (out+j+0)[0] = (uint)in[i+2];
7062 (out+j+0)[1] = (uint)in[i+0];
7063 (out+j+0)[2] = (uint)in[i+3];
7064 (out+j+3)[0] = (uint)in[i+0];
7065 (out+j+3)[1] = (uint)in[i+1];
7066 (out+j+3)[2] = (uint)in[i+3];
7078 uint * restrict out = (uint* restrict)_out;
7084 (out+j+0)[0] = restart_index;
7085 (out+j+0)[1] = restart_index;
7086 (out+j+0)[2] = restart_index;
7104 (out+j)[0] = (uint)in[i+1];
7105 (out+j)[1] = (uint)in[i+2];
7106 (out+j)[2] = (uint)in[start];
7118 uint * restrict out = (uint* restrict)_out;
7122 (out+j)[0] = (uint)in[i+0];
7123 (out+j)[1] = (uint)in[i+1];
7124 (out+j)[2] = (uint)in[i+2];
7125 (out+j)[3] = (uint)in[i+3];
7137 uint * restrict out = (uint* restrict)_out;
7141 (out+j)[0] = (uint)in[i+0];
7142 (out+j)[1] = (uint)in[i+1];
7143 (out+j)[2] = (uint)in[i+2];
7144 (out+j)[3] = (uint)in[i+3];
7156 uint * restrict out = (uint* restrict)_out;
7160 (out+j)[0] = (uint)in[i+0];
7161 (out+j)[1] = (uint)in[i+1];
7162 (out+j)[2] = (uint)in[i+2];
7163 (out+j)[3] = (uint)in[i+3];
7164 (out+j)[4] = (uint)in[i+4];
7165 (out+j)[5] = (uint)in[i+5];
7177 uint * restrict out = (uint* restrict)_out;
7183 (out+j)[0] = (uint)in[i+0];
7184 (out+j)[1] = (uint)in[i+1];
7185 (out+j)[2] = (uint)in[i+2];
7186 (out+j)[3] = (uint)in[i+3];
7187 (out+j)[4] = (uint)in[i+4];
7188 (out+j)[5] = (uint)in[i+5];
7191 (out+j)[0] = (uint)in[i+2];
7192 (out+j)[1] = (uint)in[i-2];
7193 (out+j)[2] = (uint)in[i+0];
7194 (out+j)[3] = (uint)in[i+3];
7195 (out+j)[4] = (uint)in[i+4];
7196 (out+j)[5] = (uint)in[i+6];
7209 ushort * restrict out = (ushort* restrict)_out;
7213 (out+j)[0] = (ushort)in[i];
7225 ushort * restrict out = (ushort* restrict)_out;
7229 (out+j)[0] = (ushort)in[i];
7230 (out+j)[1] = (ushort)in[i+1];
7242 ushort * restrict out = (ushort* restrict)_out;
7246 (out+j)[0] = (ushort)in[i];
7247 (out+j)[1] = (ushort)in[i+1];
7259 ushort * restrict out = (ushort* restrict)_out;
7264 (out+j)[0] = (ushort)in[i];
7265 (out+j)[1] = (ushort)in[i+1];
7268 (out+j)[0] = (ushort)in[end];
7269 (out+j)[1] = (ushort)in[start];
7280 ushort * restrict out = (ushort* restrict)_out;
7284 (out+j)[0] = (ushort)in[i];
7285 (out+j)[1] = (ushort)in[i+1];
7286 (out+j)[2] = (ushort)in[i+2];
7298 ushort * restrict out = (ushort* restrict)_out;
7302 (out+j)[0] = (ushort)in[i];
7303 (out+j)[1] = (ushort)in[i+1+(i&1)];
7304 (out+j)[2] = (ushort)in[i+2-(i&1)];
7316 ushort * restrict out = (ushort* restrict)_out;
7320 (out+j)[0] = (ushort)in[i+1];
7321 (out+j)[1] = (ushort)in[i+2];
7322 (out+j)[2] = (ushort)in[start];
7334 ushort * restrict out = (ushort* restrict)_out;
7338 (out+j+0)[0] = (ushort)in[i+0];
7339 (out+j+0)[1] = (ushort)in[i+1];
7340 (out+j+0)[2] = (ushort)in[i+2];
7341 (out+j+3)[0] = (ushort)in[i+0];
7342 (out+j+3)[1] = (ushort)in[i+2];
7343 (out+j+3)[2] = (ushort)in[i+3];
7355 ushort * restrict out = (ushort* restrict)_out;
7359 (out+j+0)[0] = (ushort)in[i+0];
7360 (out+j+0)[1] = (ushort)in[i+1];
7361 (out+j+0)[2] = (ushort)in[i+3];
7362 (out+j+3)[0] = (ushort)in[i+0];
7363 (out+j+3)[1] = (ushort)in[i+3];
7364 (out+j+3)[2] = (ushort)in[i+2];
7376 ushort * restrict out = (ushort* restrict)_out;
7380 (out+j)[0] = (ushort)in[start];
7381 (out+j)[1] = (ushort)in[i+1];
7382 (out+j)[2] = (ushort)in[i+2];
7394 ushort * restrict out = (ushort* restrict)_out;
7398 (out+j)[0] = (ushort)in[i+0];
7399 (out+j)[1] = (ushort)in[i+1];
7400 (out+j)[2] = (ushort)in[i+2];
7401 (out+j)[3] = (ushort)in[i+3];
7413 ushort * restrict out = (ushort* restrict)_out;
7417 (out+j)[0] = (ushort)in[i+0];
7418 (out+j)[1] = (ushort)in[i+1];
7419 (out+j)[2] = (ushort)in[i+2];
7420 (out+j)[3] = (ushort)in[i+3];
7432 ushort * restrict out = (ushort* restrict)_out;
7436 (out+j)[0] = (ushort)in[i+0];
7437 (out+j)[1] = (ushort)in[i+1];
7438 (out+j)[2] = (ushort)in[i+2];
7439 (out+j)[3] = (ushort)in[i+3];
7440 (out+j)[4] = (ushort)in[i+4];
7441 (out+j)[5] = (ushort)in[i+5];
7453 ushort * restrict out = (ushort* restrict)_out;
7459 (out+j)[0] = (ushort)in[i+0];
7460 (out+j)[1] = (ushort)in[i+1];
7461 (out+j)[2] = (ushort)in[i+2];
7462 (out+j)[3] = (ushort)in[i+3];
7463 (out+j)[4] = (ushort)in[i+4];
7464 (out+j)[5] = (ushort)in[i+5];
7467 (out+j)[0] = (ushort)in[i+2];
7468 (out+j)[1] = (ushort)in[i-2];
7469 (out+j)[2] = (ushort)in[i+0];
7470 (out+j)[3] = (ushort)in[i+3];
7471 (out+j)[4] = (ushort)in[i+4];
7472 (out+j)[5] = (ushort)in[i+6];
7485 ushort * restrict out = (ushort* restrict)_out;
7489 (out+j)[0] = (ushort)in[i];
7501 ushort * restrict out = (ushort* restrict)_out;
7505 (out+j)[0] = (ushort)in[i];
7506 (out+j)[1] = (ushort)in[i+1];
7518 ushort * restrict out = (ushort* restrict)_out;
7522 (out+j)[0] = (ushort)in[i];
7523 (out+j)[1] = (ushort)in[i+1];
7535 ushort * restrict out = (ushort* restrict)_out;
7542 (out+j+0)[0] = restart_index;
7543 (out+j+0)[1] = restart_index;
7548 (out+j)[0] = (ushort)in[end];
7549 (out+j)[1] = (ushort)in[start];
7557 (out+j)[0] = (ushort)in[end];
7558 (out+j)[1] = (ushort)in[start];
7564 (out+j)[0] = (ushort)in[i];
7565 (out+j)[1] = (ushort)in[i+1];
7568 (out+j)[0] = (ushort)in[end];
7569 (out+j)[1] = (ushort)in[start];
7580 ushort * restrict out = (ushort* restrict)_out;
7584 (out+j)[0] = (ushort)in[i];
7585 (out+j)[1] = (ushort)in[i+1];
7586 (out+j)[2] = (ushort)in[i+2];
7598 ushort * restrict out = (ushort* restrict)_out;
7602 (out+j)[0] = (ushort)in[i];
7603 (out+j)[1] = (ushort)in[i+1+(i&1)];
7604 (out+j)[2] = (ushort)in[i+2-(i&1)];
7616 ushort * restrict out = (ushort* restrict)_out;
7622 (out+j+0)[0] = restart_index;
7623 (out+j+0)[1] = restart_index;
7624 (out+j+0)[2] = restart_index;
7642 (out+j)[0] = (ushort)in[i+1];
7643 (out+j)[1] = (ushort)in[i+2];
7644 (out+j)[2] = (ushort)in[start];
7656 ushort * restrict out = (ushort* restrict)_out;
7662 (out+j+0)[0] = restart_index;
7663 (out+j+0)[1] = restart_index;
7664 (out+j+0)[2] = restart_index;
7665 (out+j+3)[0] = restart_index;
7666 (out+j+3)[1] = restart_index;
7667 (out+j+3)[2] = restart_index;
7686 (out+j+0)[0] = (ushort)in[i+0];
7687 (out+j+0)[1] = (ushort)in[i+1];
7688 (out+j+0)[2] = (ushort)in[i+2];
7689 (out+j+3)[0] = (ushort)in[i+0];
7690 (out+j+3)[1] = (ushort)in[i+2];
7691 (out+j+3)[2] = (ushort)in[i+3];
7703 ushort * restrict out = (ushort* restrict)_out;
7709 (out+j+0)[0] = restart_index;
7710 (out+j+0)[1] = restart_index;
7711 (out+j+0)[2] = restart_index;
7712 (out+j+3)[0] = restart_index;
7713 (out+j+3)[1] = restart_index;
7714 (out+j+3)[2] = restart_index;
7733 (out+j+0)[0] = (ushort)in[i+0];
7734 (out+j+0)[1] = (ushort)in[i+1];
7735 (out+j+0)[2] = (ushort)in[i+3];
7736 (out+j+3)[0] = (ushort)in[i+0];
7737 (out+j+3)[1] = (ushort)in[i+3];
7738 (out+j+3)[2] = (ushort)in[i+2];
7750 ushort * restrict out = (ushort* restrict)_out;
7756 (out+j+0)[0] = restart_index;
7757 (out+j+0)[1] = restart_index;
7758 (out+j+0)[2] = restart_index;
7776 (out+j)[0] = (ushort)in[start];
7777 (out+j)[1] = (ushort)in[i+1];
7778 (out+j)[2] = (ushort)in[i+2];
7790 ushort * restrict out = (ushort* restrict)_out;
7794 (out+j)[0] = (ushort)in[i+0];
7795 (out+j)[1] = (ushort)in[i+1];
7796 (out+j)[2] = (ushort)in[i+2];
7797 (out+j)[3] = (ushort)in[i+3];
7809 ushort * restrict out = (ushort* restrict)_out;
7813 (out+j)[0] = (ushort)in[i+0];
7814 (out+j)[1] = (ushort)in[i+1];
7815 (out+j)[2] = (ushort)in[i+2];
7816 (out+j)[3] = (ushort)in[i+3];
7828 ushort * restrict out = (ushort* restrict)_out;
7832 (out+j)[0] = (ushort)in[i+0];
7833 (out+j)[1] = (ushort)in[i+1];
7834 (out+j)[2] = (ushort)in[i+2];
7835 (out+j)[3] = (ushort)in[i+3];
7836 (out+j)[4] = (ushort)in[i+4];
7837 (out+j)[5] = (ushort)in[i+5];
7849 ushort * restrict out = (ushort* restrict)_out;
7855 (out+j)[0] = (ushort)in[i+0];
7856 (out+j)[1] = (ushort)in[i+1];
7857 (out+j)[2] = (ushort)in[i+2];
7858 (out+j)[3] = (ushort)in[i+3];
7859 (out+j)[4] = (ushort)in[i+4];
7860 (out+j)[5] = (ushort)in[i+5];
7863 (out+j)[0] = (ushort)in[i+2];
7864 (out+j)[1] = (ushort)in[i-2];
7865 (out+j)[2] = (ushort)in[i+0];
7866 (out+j)[3] = (ushort)in[i+3];
7867 (out+j)[4] = (ushort)in[i+4];
7868 (out+j)[5] = (ushort)in[i+6];
7881 ushort * restrict out = (ushort* restrict)_out;
7885 (out+j)[0] = (ushort)in[i];
7897 ushort * restrict out = (ushort* restrict)_out;
7901 (out+j)[0] = (ushort)in[i+1];
7902 (out+j)[1] = (ushort)in[i];
7914 ushort * restrict out = (ushort* restrict)_out;
7918 (out+j)[0] = (ushort)in[i+1];
7919 (out+j)[1] = (ushort)in[i];
7931 ushort * restrict out = (ushort* restrict)_out;
7936 (out+j)[0] = (ushort)in[i+1];
7937 (out+j)[1] = (ushort)in[i];
7940 (out+j)[0] = (ushort)in[start];
7941 (out+j)[1] = (ushort)in[end];
7952 ushort * restrict out = (ushort* restrict)_out;
7956 (out+j)[0] = (ushort)in[i+1];
7957 (out+j)[1] = (ushort)in[i+2];
7958 (out+j)[2] = (ushort)in[i];
7970 ushort * restrict out = (ushort* restrict)_out;
7974 (out+j)[0] = (ushort)in[i+1+(i&1)];
7975 (out+j)[1] = (ushort)in[i+2-(i&1)];
7976 (out+j)[2] = (ushort)in[i];
7988 ushort * restrict out = (ushort* restrict)_out;
7992 (out+j)[0] = (ushort)in[i+2];
7993 (out+j)[1] = (ushort)in[start];
7994 (out+j)[2] = (ushort)in[i+1];
8006 ushort * restrict out = (ushort* restrict)_out;
8010 (out+j+0)[0] = (ushort)in[i+1];
8011 (out+j+0)[1] = (ushort)in[i+2];
8012 (out+j+0)[2] = (ushort)in[i+0];
8013 (out+j+3)[0] = (ushort)in[i+2];
8014 (out+j+3)[1] = (ushort)in[i+3];
8015 (out+j+3)[2] = (ushort)in[i+0];
8027 ushort * restrict out = (ushort* restrict)_out;
8031 (out+j+0)[0] = (ushort)in[i+1];
8032 (out+j+0)[1] = (ushort)in[i+3];
8033 (out+j+0)[2] = (ushort)in[i+0];
8034 (out+j+3)[0] = (ushort)in[i+3];
8035 (out+j+3)[1] = (ushort)in[i+2];
8036 (out+j+3)[2] = (ushort)in[i+0];
8048 ushort * restrict out = (ushort* restrict)_out;
8052 (out+j)[0] = (ushort)in[i+1];
8053 (out+j)[1] = (ushort)in[i+2];
8054 (out+j)[2] = (ushort)in[start];
8066 ushort * restrict out = (ushort* restrict)_out;
8070 (out+j)[0] = (ushort)in[i+3];
8071 (out+j)[1] = (ushort)in[i+2];
8072 (out+j)[2] = (ushort)in[i+1];
8073 (out+j)[3] = (ushort)in[i+0];
8085 ushort * restrict out = (ushort* restrict)_out;
8089 (out+j)[0] = (ushort)in[i+3];
8090 (out+j)[1] = (ushort)in[i+2];
8091 (out+j)[2] = (ushort)in[i+1];
8092 (out+j)[3] = (ushort)in[i+0];
8104 ushort * restrict out = (ushort* restrict)_out;
8108 (out+j)[0] = (ushort)in[i+4];
8109 (out+j)[1] = (ushort)in[i+5];
8110 (out+j)[2] = (ushort)in[i+0];
8111 (out+j)[3] = (ushort)in[i+1];
8112 (out+j)[4] = (ushort)in[i+2];
8113 (out+j)[5] = (ushort)in[i+3];
8125 ushort * restrict out = (ushort* restrict)_out;
8131 (out+j)[0] = (ushort)in[i+4];
8132 (out+j)[1] = (ushort)in[i+5];
8133 (out+j)[2] = (ushort)in[i+0];
8134 (out+j)[3] = (ushort)in[i+1];
8135 (out+j)[4] = (ushort)in[i+2];
8136 (out+j)[5] = (ushort)in[i+3];
8139 (out+j)[0] = (ushort)in[i+4];
8140 (out+j)[1] = (ushort)in[i+6];
8141 (out+j)[2] = (ushort)in[i+2];
8142 (out+j)[3] = (ushort)in[i-2];
8143 (out+j)[4] = (ushort)in[i+0];
8144 (out+j)[5] = (ushort)in[i+3];
8157 ushort * restrict out = (ushort* restrict)_out;
8161 (out+j)[0] = (ushort)in[i];
8173 ushort * restrict out = (ushort* restrict)_out;
8177 (out+j)[0] = (ushort)in[i+1];
8178 (out+j)[1] = (ushort)in[i];
8190 ushort * restrict out = (ushort* restrict)_out;
8194 (out+j)[0] = (ushort)in[i+1];
8195 (out+j)[1] = (ushort)in[i];
8207 ushort * restrict out = (ushort* restrict)_out;
8214 (out+j+0)[0] = restart_index;
8215 (out+j+0)[1] = restart_index;
8220 (out+j)[0] = (ushort)in[start];
8221 (out+j)[1] = (ushort)in[end];
8229 (out+j)[0] = (ushort)in[start];
8230 (out+j)[1] = (ushort)in[end];
8236 (out+j)[0] = (ushort)in[i+1];
8237 (out+j)[1] = (ushort)in[i];
8240 (out+j)[0] = (ushort)in[start];
8241 (out+j)[1] = (ushort)in[end];
8252 ushort * restrict out = (ushort* restrict)_out;
8256 (out+j)[0] = (ushort)in[i+1];
8257 (out+j)[1] = (ushort)in[i+2];
8258 (out+j)[2] = (ushort)in[i];
8270 ushort * restrict out = (ushort* restrict)_out;
8274 (out+j)[0] = (ushort)in[i+1+(i&1)];
8275 (out+j)[1] = (ushort)in[i+2-(i&1)];
8276 (out+j)[2] = (ushort)in[i];
8288 ushort * restrict out = (ushort* restrict)_out;
8294 (out+j+0)[0] = restart_index;
8295 (out+j+0)[1] = restart_index;
8296 (out+j+0)[2] = restart_index;
8314 (out+j)[0] = (ushort)in[i+2];
8315 (out+j)[1] = (ushort)in[start];
8316 (out+j)[2] = (ushort)in[i+1];
8328 ushort * restrict out = (ushort* restrict)_out;
8334 (out+j+0)[0] = restart_index;
8335 (out+j+0)[1] = restart_index;
8336 (out+j+0)[2] = restart_index;
8337 (out+j+3)[0] = restart_index;
8338 (out+j+3)[1] = restart_index;
8339 (out+j+3)[2] = restart_index;
8358 (out+j+0)[0] = (ushort)in[i+1];
8359 (out+j+0)[1] = (ushort)in[i+2];
8360 (out+j+0)[2] = (ushort)in[i+0];
8361 (out+j+3)[0] = (ushort)in[i+2];
8362 (out+j+3)[1] = (ushort)in[i+3];
8363 (out+j+3)[2] = (ushort)in[i+0];
8375 ushort * restrict out = (ushort* restrict)_out;
8381 (out+j+0)[0] = restart_index;
8382 (out+j+0)[1] = restart_index;
8383 (out+j+0)[2] = restart_index;
8384 (out+j+3)[0] = restart_index;
8385 (out+j+3)[1] = restart_index;
8386 (out+j+3)[2] = restart_index;
8405 (out+j+0)[0] = (ushort)in[i+1];
8406 (out+j+0)[1] = (ushort)in[i+3];
8407 (out+j+0)[2] = (ushort)in[i+0];
8408 (out+j+3)[0] = (ushort)in[i+3];
8409 (out+j+3)[1] = (ushort)in[i+2];
8410 (out+j+3)[2] = (ushort)in[i+0];
8422 ushort * restrict out = (ushort* restrict)_out;
8428 (out+j+0)[0] = restart_index;
8429 (out+j+0)[1] = restart_index;
8430 (out+j+0)[2] = restart_index;
8448 (out+j)[0] = (ushort)in[i+1];
8449 (out+j)[1] = (ushort)in[i+2];
8450 (out+j)[2] = (ushort)in[start];
8462 ushort * restrict out = (ushort* restrict)_out;
8466 (out+j)[0] = (ushort)in[i+3];
8467 (out+j)[1] = (ushort)in[i+2];
8468 (out+j)[2] = (ushort)in[i+1];
8469 (out+j)[3] = (ushort)in[i+0];
8481 ushort * restrict out = (ushort* restrict)_out;
8485 (out+j)[0] = (ushort)in[i+3];
8486 (out+j)[1] = (ushort)in[i+2];
8487 (out+j)[2] = (ushort)in[i+1];
8488 (out+j)[3] = (ushort)in[i+0];
8500 ushort * restrict out = (ushort* restrict)_out;
8504 (out+j)[0] = (ushort)in[i+4];
8505 (out+j)[1] = (ushort)in[i+5];
8506 (out+j)[2] = (ushort)in[i+0];
8507 (out+j)[3] = (ushort)in[i+1];
8508 (out+j)[4] = (ushort)in[i+2];
8509 (out+j)[5] = (ushort)in[i+3];
8521 ushort * restrict out = (ushort* restrict)_out;
8527 (out+j)[0] = (ushort)in[i+4];
8528 (out+j)[1] = (ushort)in[i+5];
8529 (out+j)[2] = (ushort)in[i+0];
8530 (out+j)[3] = (ushort)in[i+1];
8531 (out+j)[4] = (ushort)in[i+2];
8532 (out+j)[5] = (ushort)in[i+3];
8535 (out+j)[0] = (ushort)in[i+4];
8536 (out+j)[1] = (ushort)in[i+6];
8537 (out+j)[2] = (ushort)in[i+2];
8538 (out+j)[3] = (ushort)in[i-2];
8539 (out+j)[4] = (ushort)in[i+0];
8540 (out+j)[5] = (ushort)in[i+3];
8553 ushort * restrict out = (ushort* restrict)_out;
8557 (out+j)[0] = (ushort)in[i];
8569 ushort * restrict out = (ushort* restrict)_out;
8573 (out+j)[0] = (ushort)in[i+1];
8574 (out+j)[1] = (ushort)in[i];
8586 ushort * restrict out = (ushort* restrict)_out;
8590 (out+j)[0] = (ushort)in[i+1];
8591 (out+j)[1] = (ushort)in[i];
8603 ushort * restrict out = (ushort* restrict)_out;
8608 (out+j)[0] = (ushort)in[i+1];
8609 (out+j)[1] = (ushort)in[i];
8612 (out+j)[0] = (ushort)in[start];
8613 (out+j)[1] = (ushort)in[end];
8624 ushort * restrict out = (ushort* restrict)_out;
8628 (out+j)[0] = (ushort)in[i+2];
8629 (out+j)[1] = (ushort)in[i];
8630 (out+j)[2] = (ushort)in[i+1];
8642 ushort * restrict out = (ushort* restrict)_out;
8646 (out+j)[0] = (ushort)in[i+2];
8647 (out+j)[1] = (ushort)in[i+(i&1)];
8648 (out+j)[2] = (ushort)in[i+1-(i&1)];
8660 ushort * restrict out = (ushort* restrict)_out;
8664 (out+j)[0] = (ushort)in[i+2];
8665 (out+j)[1] = (ushort)in[start];
8666 (out+j)[2] = (ushort)in[i+1];
8678 ushort * restrict out = (ushort* restrict)_out;
8682 (out+j+0)[0] = (ushort)in[i+3];
8683 (out+j+0)[1] = (ushort)in[i+0];
8684 (out+j+0)[2] = (ushort)in[i+1];
8685 (out+j+3)[0] = (ushort)in[i+3];
8686 (out+j+3)[1] = (ushort)in[i+1];
8687 (out+j+3)[2] = (ushort)in[i+2];
8699 ushort * restrict out = (ushort* restrict)_out;
8703 (out+j+0)[0] = (ushort)in[i+3];
8704 (out+j+0)[1] = (ushort)in[i+2];
8705 (out+j+0)[2] = (ushort)in[i+0];
8706 (out+j+3)[0] = (ushort)in[i+3];
8707 (out+j+3)[1] = (ushort)in[i+0];
8708 (out+j+3)[2] = (ushort)in[i+1];
8720 ushort * restrict out = (ushort* restrict)_out;
8724 (out+j)[0] = (ushort)in[start];
8725 (out+j)[1] = (ushort)in[i+1];
8726 (out+j)[2] = (ushort)in[i+2];
8738 ushort * restrict out = (ushort* restrict)_out;
8742 (out+j)[0] = (ushort)in[i+3];
8743 (out+j)[1] = (ushort)in[i+2];
8744 (out+j)[2] = (ushort)in[i+1];
8745 (out+j)[3] = (ushort)in[i+0];
8757 ushort * restrict out = (ushort* restrict)_out;
8761 (out+j)[0] = (ushort)in[i+3];
8762 (out+j)[1] = (ushort)in[i+2];
8763 (out+j)[2] = (ushort)in[i+1];
8764 (out+j)[3] = (ushort)in[i+0];
8776 ushort * restrict out = (ushort* restrict)_out;
8780 (out+j)[0] = (ushort)in[i+4];
8781 (out+j)[1] = (ushort)in[i+5];
8782 (out+j)[2] = (ushort)in[i+0];
8783 (out+j)[3] = (ushort)in[i+1];
8784 (out+j)[4] = (ushort)in[i+2];
8785 (out+j)[5] = (ushort)in[i+3];
8797 ushort * restrict out = (ushort* restrict)_out;
8803 (out+j)[0] = (ushort)in[i+4];
8804 (out+j)[1] = (ushort)in[i+5];
8805 (out+j)[2] = (ushort)in[i+0];
8806 (out+j)[3] = (ushort)in[i+1];
8807 (out+j)[4] = (ushort)in[i+2];
8808 (out+j)[5] = (ushort)in[i+3];
8811 (out+j)[0] = (ushort)in[i+4];
8812 (out+j)[1] = (ushort)in[i+6];
8813 (out+j)[2] = (ushort)in[i+2];
8814 (out+j)[3] = (ushort)in[i-2];
8815 (out+j)[4] = (ushort)in[i+0];
8816 (out+j)[5] = (ushort)in[i+3];
8829 ushort * restrict out = (ushort* restrict)_out;
8833 (out+j)[0] = (ushort)in[i];
8845 ushort * restrict out = (ushort* restrict)_out;
8849 (out+j)[0] = (ushort)in[i+1];
8850 (out+j)[1] = (ushort)in[i];
8862 ushort * restrict out = (ushort* restrict)_out;
8866 (out+j)[0] = (ushort)in[i+1];
8867 (out+j)[1] = (ushort)in[i];
8879 ushort * restrict out = (ushort* restrict)_out;
8886 (out+j+0)[0] = restart_index;
8887 (out+j+0)[1] = restart_index;
8892 (out+j)[0] = (ushort)in[start];
8893 (out+j)[1] = (ushort)in[end];
8901 (out+j)[0] = (ushort)in[start];
8902 (out+j)[1] = (ushort)in[end];
8908 (out+j)[0] = (ushort)in[i+1];
8909 (out+j)[1] = (ushort)in[i];
8912 (out+j)[0] = (ushort)in[start];
8913 (out+j)[1] = (ushort)in[end];
8924 ushort * restrict out = (ushort* restrict)_out;
8928 (out+j)[0] = (ushort)in[i+2];
8929 (out+j)[1] = (ushort)in[i];
8930 (out+j)[2] = (ushort)in[i+1];
8942 ushort * restrict out = (ushort* restrict)_out;
8946 (out+j)[0] = (ushort)in[i+2];
8947 (out+j)[1] = (ushort)in[i+(i&1)];
8948 (out+j)[2] = (ushort)in[i+1-(i&1)];
8960 ushort * restrict out = (ushort* restrict)_out;
8966 (out+j+0)[0] = restart_index;
8967 (out+j+0)[1] = restart_index;
8968 (out+j+0)[2] = restart_index;
8986 (out+j)[0] = (ushort)in[i+2];
8987 (out+j)[1] = (ushort)in[start];
8988 (out+j)[2] = (ushort)in[i+1];
9000 ushort * restrict out = (ushort* restrict)_out;
9006 (out+j+0)[0] = restart_index;
9007 (out+j+0)[1] = restart_index;
9008 (out+j+0)[2] = restart_index;
9009 (out+j+3)[0] = restart_index;
9010 (out+j+3)[1] = restart_index;
9011 (out+j+3)[2] = restart_index;
9030 (out+j+0)[0] = (ushort)in[i+3];
9031 (out+j+0)[1] = (ushort)in[i+0];
9032 (out+j+0)[2] = (ushort)in[i+1];
9033 (out+j+3)[0] = (ushort)in[i+3];
9034 (out+j+3)[1] = (ushort)in[i+1];
9035 (out+j+3)[2] = (ushort)in[i+2];
9047 ushort * restrict out = (ushort* restrict)_out;
9053 (out+j+0)[0] = restart_index;
9054 (out+j+0)[1] = restart_index;
9055 (out+j+0)[2] = restart_index;
9056 (out+j+3)[0] = restart_index;
9057 (out+j+3)[1] = restart_index;
9058 (out+j+3)[2] = restart_index;
9077 (out+j+0)[0] = (ushort)in[i+3];
9078 (out+j+0)[1] = (ushort)in[i+2];
9079 (out+j+0)[2] = (ushort)in[i+0];
9080 (out+j+3)[0] = (ushort)in[i+3];
9081 (out+j+3)[1] = (ushort)in[i+0];
9082 (out+j+3)[2] = (ushort)in[i+1];
9094 ushort * restrict out = (ushort* restrict)_out;
9100 (out+j+0)[0] = restart_index;
9101 (out+j+0)[1] = restart_index;
9102 (out+j+0)[2] = restart_index;
9120 (out+j)[0] = (ushort)in[start];
9121 (out+j)[1] = (ushort)in[i+1];
9122 (out+j)[2] = (ushort)in[i+2];
9134 ushort * restrict out = (ushort* restrict)_out;
9138 (out+j)[0] = (ushort)in[i+3];
9139 (out+j)[1] = (ushort)in[i+2];
9140 (out+j)[2] = (ushort)in[i+1];
9141 (out+j)[3] = (ushort)in[i+0];
9153 ushort * restrict out = (ushort* restrict)_out;
9157 (out+j)[0] = (ushort)in[i+3];
9158 (out+j)[1] = (ushort)in[i+2];
9159 (out+j)[2] = (ushort)in[i+1];
9160 (out+j)[3] = (ushort)in[i+0];
9172 ushort * restrict out = (ushort* restrict)_out;
9176 (out+j)[0] = (ushort)in[i+4];
9177 (out+j)[1] = (ushort)in[i+5];
9178 (out+j)[2] = (ushort)in[i+0];
9179 (out+j)[3] = (ushort)in[i+1];
9180 (out+j)[4] = (ushort)in[i+2];
9181 (out+j)[5] = (ushort)in[i+3];
9193 ushort * restrict out = (ushort* restrict)_out;
9199 (out+j)[0] = (ushort)in[i+4];
9200 (out+j)[1] = (ushort)in[i+5];
9201 (out+j)[2] = (ushort)in[i+0];
9202 (out+j)[3] = (ushort)in[i+1];
9203 (out+j)[4] = (ushort)in[i+2];
9204 (out+j)[5] = (ushort)in[i+3];
9207 (out+j)[0] = (ushort)in[i+4];
9208 (out+j)[1] = (ushort)in[i+6];
9209 (out+j)[2] = (ushort)in[i+2];
9210 (out+j)[3] = (ushort)in[i-2];
9211 (out+j)[4] = (ushort)in[i+0];
9212 (out+j)[5] = (ushort)in[i+3];
9225 ushort * restrict out = (ushort* restrict)_out;
9229 (out+j)[0] = (ushort)in[i];
9241 ushort * restrict out = (ushort* restrict)_out;
9245 (out+j)[0] = (ushort)in[i];
9246 (out+j)[1] = (ushort)in[i+1];
9258 ushort * restrict out = (ushort* restrict)_out;
9262 (out+j)[0] = (ushort)in[i];
9263 (out+j)[1] = (ushort)in[i+1];
9275 ushort * restrict out = (ushort* restrict)_out;
9280 (out+j)[0] = (ushort)in[i];
9281 (out+j)[1] = (ushort)in[i+1];
9284 (out+j)[0] = (ushort)in[end];
9285 (out+j)[1] = (ushort)in[start];
9296 ushort * restrict out = (ushort* restrict)_out;
9300 (out+j)[0] = (ushort)in[i];
9301 (out+j)[1] = (ushort)in[i+1];
9302 (out+j)[2] = (ushort)in[i+2];
9314 ushort * restrict out = (ushort* restrict)_out;
9318 (out+j)[0] = (ushort)in[i+(i&1)];
9319 (out+j)[1] = (ushort)in[i+1-(i&1)];
9320 (out+j)[2] = (ushort)in[i+2];
9332 ushort * restrict out = (ushort* restrict)_out;
9336 (out+j)[0] = (ushort)in[start];
9337 (out+j)[1] = (ushort)in[i+1];
9338 (out+j)[2] = (ushort)in[i+2];
9350 ushort * restrict out = (ushort* restrict)_out;
9354 (out+j+0)[0] = (ushort)in[i+0];
9355 (out+j+0)[1] = (ushort)in[i+1];
9356 (out+j+0)[2] = (ushort)in[i+3];
9357 (out+j+3)[0] = (ushort)in[i+1];
9358 (out+j+3)[1] = (ushort)in[i+2];
9359 (out+j+3)[2] = (ushort)in[i+3];
9371 ushort * restrict out = (ushort* restrict)_out;
9375 (out+j+0)[0] = (ushort)in[i+2];
9376 (out+j+0)[1] = (ushort)in[i+0];
9377 (out+j+0)[2] = (ushort)in[i+3];
9378 (out+j+3)[0] = (ushort)in[i+0];
9379 (out+j+3)[1] = (ushort)in[i+1];
9380 (out+j+3)[2] = (ushort)in[i+3];
9392 ushort * restrict out = (ushort* restrict)_out;
9396 (out+j)[0] = (ushort)in[i+1];
9397 (out+j)[1] = (ushort)in[i+2];
9398 (out+j)[2] = (ushort)in[start];
9410 ushort * restrict out = (ushort* restrict)_out;
9414 (out+j)[0] = (ushort)in[i+0];
9415 (out+j)[1] = (ushort)in[i+1];
9416 (out+j)[2] = (ushort)in[i+2];
9417 (out+j)[3] = (ushort)in[i+3];
9429 ushort * restrict out = (ushort* restrict)_out;
9433 (out+j)[0] = (ushort)in[i+0];
9434 (out+j)[1] = (ushort)in[i+1];
9435 (out+j)[2] = (ushort)in[i+2];
9436 (out+j)[3] = (ushort)in[i+3];
9448 ushort * restrict out = (ushort* restrict)_out;
9452 (out+j)[0] = (ushort)in[i+0];
9453 (out+j)[1] = (ushort)in[i+1];
9454 (out+j)[2] = (ushort)in[i+2];
9455 (out+j)[3] = (ushort)in[i+3];
9456 (out+j)[4] = (ushort)in[i+4];
9457 (out+j)[5] = (ushort)in[i+5];
9469 ushort * restrict out = (ushort* restrict)_out;
9475 (out+j)[0] = (ushort)in[i+0];
9476 (out+j)[1] = (ushort)in[i+1];
9477 (out+j)[2] = (ushort)in[i+2];
9478 (out+j)[3] = (ushort)in[i+3];
9479 (out+j)[4] = (ushort)in[i+4];
9480 (out+j)[5] = (ushort)in[i+5];
9483 (out+j)[0] = (ushort)in[i+2];
9484 (out+j)[1] = (ushort)in[i-2];
9485 (out+j)[2] = (ushort)in[i+0];
9486 (out+j)[3] = (ushort)in[i+3];
9487 (out+j)[4] = (ushort)in[i+4];
9488 (out+j)[5] = (ushort)in[i+6];
9501 ushort * restrict out = (ushort* restrict)_out;
9505 (out+j)[0] = (ushort)in[i];
9517 ushort * restrict out = (ushort* restrict)_out;
9521 (out+j)[0] = (ushort)in[i];
9522 (out+j)[1] = (ushort)in[i+1];
9534 ushort * restrict out = (ushort* restrict)_out;
9538 (out+j)[0] = (ushort)in[i];
9539 (out+j)[1] = (ushort)in[i+1];
9551 ushort * restrict out = (ushort* restrict)_out;
9558 (out+j+0)[0] = restart_index;
9559 (out+j+0)[1] = restart_index;
9564 (out+j)[0] = (ushort)in[end];
9565 (out+j)[1] = (ushort)in[start];
9573 (out+j)[0] = (ushort)in[end];
9574 (out+j)[1] = (ushort)in[start];
9580 (out+j)[0] = (ushort)in[i];
9581 (out+j)[1] = (ushort)in[i+1];
9584 (out+j)[0] = (ushort)in[end];
9585 (out+j)[1] = (ushort)in[start];
9596 ushort * restrict out = (ushort* restrict)_out;
9600 (out+j)[0] = (ushort)in[i];
9601 (out+j)[1] = (ushort)in[i+1];
9602 (out+j)[2] = (ushort)in[i+2];
9614 ushort * restrict out = (ushort* restrict)_out;
9618 (out+j)[0] = (ushort)in[i+(i&1)];
9619 (out+j)[1] = (ushort)in[i+1-(i&1)];
9620 (out+j)[2] = (ushort)in[i+2];
9632 ushort * restrict out = (ushort* restrict)_out;
9638 (out+j+0)[0] = restart_index;
9639 (out+j+0)[1] = restart_index;
9640 (out+j+0)[2] = restart_index;
9658 (out+j)[0] = (ushort)in[start];
9659 (out+j)[1] = (ushort)in[i+1];
9660 (out+j)[2] = (ushort)in[i+2];
9672 ushort * restrict out = (ushort* restrict)_out;
9678 (out+j+0)[0] = restart_index;
9679 (out+j+0)[1] = restart_index;
9680 (out+j+0)[2] = restart_index;
9681 (out+j+3)[0] = restart_index;
9682 (out+j+3)[1] = restart_index;
9683 (out+j+3)[2] = restart_index;
9702 (out+j+0)[0] = (ushort)in[i+0];
9703 (out+j+0)[1] = (ushort)in[i+1];
9704 (out+j+0)[2] = (ushort)in[i+3];
9705 (out+j+3)[0] = (ushort)in[i+1];
9706 (out+j+3)[1] = (ushort)in[i+2];
9707 (out+j+3)[2] = (ushort)in[i+3];
9719 ushort * restrict out = (ushort* restrict)_out;
9725 (out+j+0)[0] = restart_index;
9726 (out+j+0)[1] = restart_index;
9727 (out+j+0)[2] = restart_index;
9728 (out+j+3)[0] = restart_index;
9729 (out+j+3)[1] = restart_index;
9730 (out+j+3)[2] = restart_index;
9749 (out+j+0)[0] = (ushort)in[i+2];
9750 (out+j+0)[1] = (ushort)in[i+0];
9751 (out+j+0)[2] = (ushort)in[i+3];
9752 (out+j+3)[0] = (ushort)in[i+0];
9753 (out+j+3)[1] = (ushort)in[i+1];
9754 (out+j+3)[2] = (ushort)in[i+3];
9766 ushort * restrict out = (ushort* restrict)_out;
9772 (out+j+0)[0] = restart_index;
9773 (out+j+0)[1] = restart_index;
9774 (out+j+0)[2] = restart_index;
9792 (out+j)[0] = (ushort)in[i+1];
9793 (out+j)[1] = (ushort)in[i+2];
9794 (out+j)[2] = (ushort)in[start];
9806 ushort * restrict out = (ushort* restrict)_out;
9810 (out+j)[0] = (ushort)in[i+0];
9811 (out+j)[1] = (ushort)in[i+1];
9812 (out+j)[2] = (ushort)in[i+2];
9813 (out+j)[3] = (ushort)in[i+3];
9825 ushort * restrict out = (ushort* restrict)_out;
9829 (out+j)[0] = (ushort)in[i+0];
9830 (out+j)[1] = (ushort)in[i+1];
9831 (out+j)[2] = (ushort)in[i+2];
9832 (out+j)[3] = (ushort)in[i+3];
9844 ushort * restrict out = (ushort* restrict)_out;
9848 (out+j)[0] = (ushort)in[i+0];
9849 (out+j)[1] = (ushort)in[i+1];
9850 (out+j)[2] = (ushort)in[i+2];
9851 (out+j)[3] = (ushort)in[i+3];
9852 (out+j)[4] = (ushort)in[i+4];
9853 (out+j)[5] = (ushort)in[i+5];
9865 ushort * restrict out = (ushort* restrict)_out;
9871 (out+j)[0] = (ushort)in[i+0];
9872 (out+j)[1] = (ushort)in[i+1];
9873 (out+j)[2] = (ushort)in[i+2];
9874 (out+j)[3] = (ushort)in[i+3];
9875 (out+j)[4] = (ushort)in[i+4];
9876 (out+j)[5] = (ushort)in[i+5];
9879 (out+j)[0] = (ushort)in[i+2];
9880 (out+j)[1] = (ushort)in[i-2];
9881 (out+j)[2] = (ushort)in[i+0];
9882 (out+j)[3] = (ushort)in[i+3];
9883 (out+j)[4] = (ushort)in[i+4];
9884 (out+j)[5] = (ushort)in[i+6];
9897 uint * restrict out = (uint* restrict)_out;
9901 (out+j)[0] = (uint)in[i];
9913 uint * restrict out = (uint* restrict)_out;
9917 (out+j)[0] = (uint)in[i];
9918 (out+j)[1] = (uint)in[i+1];
9930 uint * restrict out = (uint* restrict)_out;
9934 (out+j)[0] = (uint)in[i];
9935 (out+j)[1] = (uint)in[i+1];
9947 uint * restrict out = (uint* restrict)_out;
9952 (out+j)[0] = (uint)in[i];
9953 (out+j)[1] = (uint)in[i+1];
9956 (out+j)[0] = (uint)in[end];
9957 (out+j)[1] = (uint)in[start];
9968 uint * restrict out = (uint* restrict)_out;
9972 (out+j)[0] = (uint)in[i];
9973 (out+j)[1] = (uint)in[i+1];
9974 (out+j)[2] = (uint)in[i+2];
9986 uint * restrict out = (uint* restrict)_out;
9990 (out+j)[0] = (uint)in[i];
9991 (out+j)[1] = (uint)in[i+1+(i&1)];
9992 (out+j)[2] = (uint)in[i+2-(i&1)];
10004 uint * restrict out = (uint* restrict)_out;
10008 (out+j)[0] = (uint)in[i+1];
10009 (out+j)[1] = (uint)in[i+2];
10010 (out+j)[2] = (uint)in[start];
10022 uint * restrict out = (uint* restrict)_out;
10026 (out+j+0)[0] = (uint)in[i+0];
10027 (out+j+0)[1] = (uint)in[i+1];
10028 (out+j+0)[2] = (uint)in[i+2];
10029 (out+j+3)[0] = (uint)in[i+0];
10030 (out+j+3)[1] = (uint)in[i+2];
10031 (out+j+3)[2] = (uint)in[i+3];
10043 uint * restrict out = (uint* restrict)_out;
10047 (out+j+0)[0] = (uint)in[i+0];
10048 (out+j+0)[1] = (uint)in[i+1];
10049 (out+j+0)[2] = (uint)in[i+3];
10050 (out+j+3)[0] = (uint)in[i+0];
10051 (out+j+3)[1] = (uint)in[i+3];
10052 (out+j+3)[2] = (uint)in[i+2];
10064 uint * restrict out = (uint* restrict)_out;
10068 (out+j)[0] = (uint)in[start];
10069 (out+j)[1] = (uint)in[i+1];
10070 (out+j)[2] = (uint)in[i+2];
10082 uint * restrict out = (uint* restrict)_out;
10086 (out+j)[0] = (uint)in[i+0];
10087 (out+j)[1] = (uint)in[i+1];
10088 (out+j)[2] = (uint)in[i+2];
10089 (out+j)[3] = (uint)in[i+3];
10101 uint * restrict out = (uint* restrict)_out;
10105 (out+j)[0] = (uint)in[i+0];
10106 (out+j)[1] = (uint)in[i+1];
10107 (out+j)[2] = (uint)in[i+2];
10108 (out+j)[3] = (uint)in[i+3];
10120 uint * restrict out = (uint* restrict)_out;
10124 (out+j)[0] = (uint)in[i+0];
10125 (out+j)[1] = (uint)in[i+1];
10126 (out+j)[2] = (uint)in[i+2];
10127 (out+j)[3] = (uint)in[i+3];
10128 (out+j)[4] = (uint)in[i+4];
10129 (out+j)[5] = (uint)in[i+5];
10141 uint * restrict out = (uint* restrict)_out;
10147 (out+j)[0] = (uint)in[i+0];
10148 (out+j)[1] = (uint)in[i+1];
10149 (out+j)[2] = (uint)in[i+2];
10150 (out+j)[3] = (uint)in[i+3];
10151 (out+j)[4] = (uint)in[i+4];
10152 (out+j)[5] = (uint)in[i+5];
10155 (out+j)[0] = (uint)in[i+2];
10156 (out+j)[1] = (uint)in[i-2];
10157 (out+j)[2] = (uint)in[i+0];
10158 (out+j)[3] = (uint)in[i+3];
10159 (out+j)[4] = (uint)in[i+4];
10160 (out+j)[5] = (uint)in[i+6];
10173 uint * restrict out = (uint* restrict)_out;
10177 (out+j)[0] = (uint)in[i];
10189 uint * restrict out = (uint* restrict)_out;
10193 (out+j)[0] = (uint)in[i];
10194 (out+j)[1] = (uint)in[i+1];
10206 uint * restrict out = (uint* restrict)_out;
10210 (out+j)[0] = (uint)in[i];
10211 (out+j)[1] = (uint)in[i+1];
10223 uint * restrict out = (uint* restrict)_out;
10230 (out+j+0)[0] = restart_index;
10231 (out+j+0)[1] = restart_index;
10236 (out+j)[0] = (uint)in[end];
10237 (out+j)[1] = (uint)in[start];
10245 (out+j)[0] = (uint)in[end];
10246 (out+j)[1] = (uint)in[start];
10252 (out+j)[0] = (uint)in[i];
10253 (out+j)[1] = (uint)in[i+1];
10256 (out+j)[0] = (uint)in[end];
10257 (out+j)[1] = (uint)in[start];
10268 uint * restrict out = (uint* restrict)_out;
10272 (out+j)[0] = (uint)in[i];
10273 (out+j)[1] = (uint)in[i+1];
10274 (out+j)[2] = (uint)in[i+2];
10286 uint * restrict out = (uint* restrict)_out;
10290 (out+j)[0] = (uint)in[i];
10291 (out+j)[1] = (uint)in[i+1+(i&1)];
10292 (out+j)[2] = (uint)in[i+2-(i&1)];
10304 uint * restrict out = (uint* restrict)_out;
10310 (out+j+0)[0] = restart_index;
10311 (out+j+0)[1] = restart_index;
10312 (out+j+0)[2] = restart_index;
10330 (out+j)[0] = (uint)in[i+1];
10331 (out+j)[1] = (uint)in[i+2];
10332 (out+j)[2] = (uint)in[start];
10344 uint * restrict out = (uint* restrict)_out;
10350 (out+j+0)[0] = restart_index;
10351 (out+j+0)[1] = restart_index;
10352 (out+j+0)[2] = restart_index;
10353 (out+j+3)[0] = restart_index;
10354 (out+j+3)[1] = restart_index;
10355 (out+j+3)[2] = restart_index;
10374 (out+j+0)[0] = (uint)in[i+0];
10375 (out+j+0)[1] = (uint)in[i+1];
10376 (out+j+0)[2] = (uint)in[i+2];
10377 (out+j+3)[0] = (uint)in[i+0];
10378 (out+j+3)[1] = (uint)in[i+2];
10379 (out+j+3)[2] = (uint)in[i+3];
10391 uint * restrict out = (uint* restrict)_out;
10397 (out+j+0)[0] = restart_index;
10398 (out+j+0)[1] = restart_index;
10399 (out+j+0)[2] = restart_index;
10400 (out+j+3)[0] = restart_index;
10401 (out+j+3)[1] = restart_index;
10402 (out+j+3)[2] = restart_index;
10421 (out+j+0)[0] = (uint)in[i+0];
10422 (out+j+0)[1] = (uint)in[i+1];
10423 (out+j+0)[2] = (uint)in[i+3];
10424 (out+j+3)[0] = (uint)in[i+0];
10425 (out+j+3)[1] = (uint)in[i+3];
10426 (out+j+3)[2] = (uint)in[i+2];
10438 uint * restrict out = (uint* restrict)_out;
10444 (out+j+0)[0] = restart_index;
10445 (out+j+0)[1] = restart_index;
10446 (out+j+0)[2] = restart_index;
10464 (out+j)[0] = (uint)in[start];
10465 (out+j)[1] = (uint)in[i+1];
10466 (out+j)[2] = (uint)in[i+2];
10478 uint * restrict out = (uint* restrict)_out;
10482 (out+j)[0] = (uint)in[i+0];
10483 (out+j)[1] = (uint)in[i+1];
10484 (out+j)[2] = (uint)in[i+2];
10485 (out+j)[3] = (uint)in[i+3];
10497 uint * restrict out = (uint* restrict)_out;
10501 (out+j)[0] = (uint)in[i+0];
10502 (out+j)[1] = (uint)in[i+1];
10503 (out+j)[2] = (uint)in[i+2];
10504 (out+j)[3] = (uint)in[i+3];
10516 uint * restrict out = (uint* restrict)_out;
10520 (out+j)[0] = (uint)in[i+0];
10521 (out+j)[1] = (uint)in[i+1];
10522 (out+j)[2] = (uint)in[i+2];
10523 (out+j)[3] = (uint)in[i+3];
10524 (out+j)[4] = (uint)in[i+4];
10525 (out+j)[5] = (uint)in[i+5];
10537 uint * restrict out = (uint* restrict)_out;
10543 (out+j)[0] = (uint)in[i+0];
10544 (out+j)[1] = (uint)in[i+1];
10545 (out+j)[2] = (uint)in[i+2];
10546 (out+j)[3] = (uint)in[i+3];
10547 (out+j)[4] = (uint)in[i+4];
10548 (out+j)[5] = (uint)in[i+5];
10551 (out+j)[0] = (uint)in[i+2];
10552 (out+j)[1] = (uint)in[i-2];
10553 (out+j)[2] = (uint)in[i+0];
10554 (out+j)[3] = (uint)in[i+3];
10555 (out+j)[4] = (uint)in[i+4];
10556 (out+j)[5] = (uint)in[i+6];
10569 uint * restrict out = (uint* restrict)_out;
10573 (out+j)[0] = (uint)in[i];
10585 uint * restrict out = (uint* restrict)_out;
10589 (out+j)[0] = (uint)in[i+1];
10590 (out+j)[1] = (uint)in[i];
10602 uint * restrict out = (uint* restrict)_out;
10606 (out+j)[0] = (uint)in[i+1];
10607 (out+j)[1] = (uint)in[i];
10619 uint * restrict out = (uint* restrict)_out;
10624 (out+j)[0] = (uint)in[i+1];
10625 (out+j)[1] = (uint)in[i];
10628 (out+j)[0] = (uint)in[start];
10629 (out+j)[1] = (uint)in[end];
10640 uint * restrict out = (uint* restrict)_out;
10644 (out+j)[0] = (uint)in[i+1];
10645 (out+j)[1] = (uint)in[i+2];
10646 (out+j)[2] = (uint)in[i];
10658 uint * restrict out = (uint* restrict)_out;
10662 (out+j)[0] = (uint)in[i+1+(i&1)];
10663 (out+j)[1] = (uint)in[i+2-(i&1)];
10664 (out+j)[2] = (uint)in[i];
10676 uint * restrict out = (uint* restrict)_out;
10680 (out+j)[0] = (uint)in[i+2];
10681 (out+j)[1] = (uint)in[start];
10682 (out+j)[2] = (uint)in[i+1];
10694 uint * restrict out = (uint* restrict)_out;
10698 (out+j+0)[0] = (uint)in[i+1];
10699 (out+j+0)[1] = (uint)in[i+2];
10700 (out+j+0)[2] = (uint)in[i+0];
10701 (out+j+3)[0] = (uint)in[i+2];
10702 (out+j+3)[1] = (uint)in[i+3];
10703 (out+j+3)[2] = (uint)in[i+0];
10715 uint * restrict out = (uint* restrict)_out;
10719 (out+j+0)[0] = (uint)in[i+1];
10720 (out+j+0)[1] = (uint)in[i+3];
10721 (out+j+0)[2] = (uint)in[i+0];
10722 (out+j+3)[0] = (uint)in[i+3];
10723 (out+j+3)[1] = (uint)in[i+2];
10724 (out+j+3)[2] = (uint)in[i+0];
10736 uint * restrict out = (uint* restrict)_out;
10740 (out+j)[0] = (uint)in[i+1];
10741 (out+j)[1] = (uint)in[i+2];
10742 (out+j)[2] = (uint)in[start];
10754 uint * restrict out = (uint* restrict)_out;
10758 (out+j)[0] = (uint)in[i+3];
10759 (out+j)[1] = (uint)in[i+2];
10760 (out+j)[2] = (uint)in[i+1];
10761 (out+j)[3] = (uint)in[i+0];
10773 uint * restrict out = (uint* restrict)_out;
10777 (out+j)[0] = (uint)in[i+3];
10778 (out+j)[1] = (uint)in[i+2];
10779 (out+j)[2] = (uint)in[i+1];
10780 (out+j)[3] = (uint)in[i+0];
10792 uint * restrict out = (uint* restrict)_out;
10796 (out+j)[0] = (uint)in[i+4];
10797 (out+j)[1] = (uint)in[i+5];
10798 (out+j)[2] = (uint)in[i+0];
10799 (out+j)[3] = (uint)in[i+1];
10800 (out+j)[4] = (uint)in[i+2];
10801 (out+j)[5] = (uint)in[i+3];
10813 uint * restrict out = (uint* restrict)_out;
10819 (out+j)[0] = (uint)in[i+4];
10820 (out+j)[1] = (uint)in[i+5];
10821 (out+j)[2] = (uint)in[i+0];
10822 (out+j)[3] = (uint)in[i+1];
10823 (out+j)[4] = (uint)in[i+2];
10824 (out+j)[5] = (uint)in[i+3];
10827 (out+j)[0] = (uint)in[i+4];
10828 (out+j)[1] = (uint)in[i+6];
10829 (out+j)[2] = (uint)in[i+2];
10830 (out+j)[3] = (uint)in[i-2];
10831 (out+j)[4] = (uint)in[i+0];
10832 (out+j)[5] = (uint)in[i+3];
10845 uint * restrict out = (uint* restrict)_out;
10849 (out+j)[0] = (uint)in[i];
10861 uint * restrict out = (uint* restrict)_out;
10865 (out+j)[0] = (uint)in[i+1];
10866 (out+j)[1] = (uint)in[i];
10878 uint * restrict out = (uint* restrict)_out;
10882 (out+j)[0] = (uint)in[i+1];
10883 (out+j)[1] = (uint)in[i];
10895 uint * restrict out = (uint* restrict)_out;
10902 (out+j+0)[0] = restart_index;
10903 (out+j+0)[1] = restart_index;
10908 (out+j)[0] = (uint)in[start];
10909 (out+j)[1] = (uint)in[end];
10917 (out+j)[0] = (uint)in[start];
10918 (out+j)[1] = (uint)in[end];
10924 (out+j)[0] = (uint)in[i+1];
10925 (out+j)[1] = (uint)in[i];
10928 (out+j)[0] = (uint)in[start];
10929 (out+j)[1] = (uint)in[end];
10940 uint * restrict out = (uint* restrict)_out;
10944 (out+j)[0] = (uint)in[i+1];
10945 (out+j)[1] = (uint)in[i+2];
10946 (out+j)[2] = (uint)in[i];
10958 uint * restrict out = (uint* restrict)_out;
10962 (out+j)[0] = (uint)in[i+1+(i&1)];
10963 (out+j)[1] = (uint)in[i+2-(i&1)];
10964 (out+j)[2] = (uint)in[i];
10976 uint * restrict out = (uint* restrict)_out;
10982 (out+j+0)[0] = restart_index;
10983 (out+j+0)[1] = restart_index;
10984 (out+j+0)[2] = restart_index;
11002 (out+j)[0] = (uint)in[i+2];
11003 (out+j)[1] = (uint)in[start];
11004 (out+j)[2] = (uint)in[i+1];
11016 uint * restrict out = (uint* restrict)_out;
11022 (out+j+0)[0] = restart_index;
11023 (out+j+0)[1] = restart_index;
11024 (out+j+0)[2] = restart_index;
11025 (out+j+3)[0] = restart_index;
11026 (out+j+3)[1] = restart_index;
11027 (out+j+3)[2] = restart_index;
11046 (out+j+0)[0] = (uint)in[i+1];
11047 (out+j+0)[1] = (uint)in[i+2];
11048 (out+j+0)[2] = (uint)in[i+0];
11049 (out+j+3)[0] = (uint)in[i+2];
11050 (out+j+3)[1] = (uint)in[i+3];
11051 (out+j+3)[2] = (uint)in[i+0];
11063 uint * restrict out = (uint* restrict)_out;
11069 (out+j+0)[0] = restart_index;
11070 (out+j+0)[1] = restart_index;
11071 (out+j+0)[2] = restart_index;
11072 (out+j+3)[0] = restart_index;
11073 (out+j+3)[1] = restart_index;
11074 (out+j+3)[2] = restart_index;
11093 (out+j+0)[0] = (uint)in[i+1];
11094 (out+j+0)[1] = (uint)in[i+3];
11095 (out+j+0)[2] = (uint)in[i+0];
11096 (out+j+3)[0] = (uint)in[i+3];
11097 (out+j+3)[1] = (uint)in[i+2];
11098 (out+j+3)[2] = (uint)in[i+0];
11110 uint * restrict out = (uint* restrict)_out;
11116 (out+j+0)[0] = restart_index;
11117 (out+j+0)[1] = restart_index;
11118 (out+j+0)[2] = restart_index;
11136 (out+j)[0] = (uint)in[i+1];
11137 (out+j)[1] = (uint)in[i+2];
11138 (out+j)[2] = (uint)in[start];
11150 uint * restrict out = (uint* restrict)_out;
11154 (out+j)[0] = (uint)in[i+3];
11155 (out+j)[1] = (uint)in[i+2];
11156 (out+j)[2] = (uint)in[i+1];
11157 (out+j)[3] = (uint)in[i+0];
11169 uint * restrict out = (uint* restrict)_out;
11173 (out+j)[0] = (uint)in[i+3];
11174 (out+j)[1] = (uint)in[i+2];
11175 (out+j)[2] = (uint)in[i+1];
11176 (out+j)[3] = (uint)in[i+0];
11188 uint * restrict out = (uint* restrict)_out;
11192 (out+j)[0] = (uint)in[i+4];
11193 (out+j)[1] = (uint)in[i+5];
11194 (out+j)[2] = (uint)in[i+0];
11195 (out+j)[3] = (uint)in[i+1];
11196 (out+j)[4] = (uint)in[i+2];
11197 (out+j)[5] = (uint)in[i+3];
11209 uint * restrict out = (uint* restrict)_out;
11215 (out+j)[0] = (uint)in[i+4];
11216 (out+j)[1] = (uint)in[i+5];
11217 (out+j)[2] = (uint)in[i+0];
11218 (out+j)[3] = (uint)in[i+1];
11219 (out+j)[4] = (uint)in[i+2];
11220 (out+j)[5] = (uint)in[i+3];
11223 (out+j)[0] = (uint)in[i+4];
11224 (out+j)[1] = (uint)in[i+6];
11225 (out+j)[2] = (uint)in[i+2];
11226 (out+j)[3] = (uint)in[i-2];
11227 (out+j)[4] = (uint)in[i+0];
11228 (out+j)[5] = (uint)in[i+3];
11241 uint * restrict out = (uint* restrict)_out;
11245 (out+j)[0] = (uint)in[i];
11257 uint * restrict out = (uint* restrict)_out;
11261 (out+j)[0] = (uint)in[i+1];
11262 (out+j)[1] = (uint)in[i];
11274 uint * restrict out = (uint* restrict)_out;
11278 (out+j)[0] = (uint)in[i+1];
11279 (out+j)[1] = (uint)in[i];
11291 uint * restrict out = (uint* restrict)_out;
11296 (out+j)[0] = (uint)in[i+1];
11297 (out+j)[1] = (uint)in[i];
11300 (out+j)[0] = (uint)in[start];
11301 (out+j)[1] = (uint)in[end];
11312 uint * restrict out = (uint* restrict)_out;
11316 (out+j)[0] = (uint)in[i+2];
11317 (out+j)[1] = (uint)in[i];
11318 (out+j)[2] = (uint)in[i+1];
11330 uint * restrict out = (uint* restrict)_out;
11334 (out+j)[0] = (uint)in[i+2];
11335 (out+j)[1] = (uint)in[i+(i&1)];
11336 (out+j)[2] = (uint)in[i+1-(i&1)];
11348 uint * restrict out = (uint* restrict)_out;
11352 (out+j)[0] = (uint)in[i+2];
11353 (out+j)[1] = (uint)in[start];
11354 (out+j)[2] = (uint)in[i+1];
11366 uint * restrict out = (uint* restrict)_out;
11370 (out+j+0)[0] = (uint)in[i+3];
11371 (out+j+0)[1] = (uint)in[i+0];
11372 (out+j+0)[2] = (uint)in[i+1];
11373 (out+j+3)[0] = (uint)in[i+3];
11374 (out+j+3)[1] = (uint)in[i+1];
11375 (out+j+3)[2] = (uint)in[i+2];
11387 uint * restrict out = (uint* restrict)_out;
11391 (out+j+0)[0] = (uint)in[i+3];
11392 (out+j+0)[1] = (uint)in[i+2];
11393 (out+j+0)[2] = (uint)in[i+0];
11394 (out+j+3)[0] = (uint)in[i+3];
11395 (out+j+3)[1] = (uint)in[i+0];
11396 (out+j+3)[2] = (uint)in[i+1];
11408 uint * restrict out = (uint* restrict)_out;
11412 (out+j)[0] = (uint)in[start];
11413 (out+j)[1] = (uint)in[i+1];
11414 (out+j)[2] = (uint)in[i+2];
11426 uint * restrict out = (uint* restrict)_out;
11430 (out+j)[0] = (uint)in[i+3];
11431 (out+j)[1] = (uint)in[i+2];
11432 (out+j)[2] = (uint)in[i+1];
11433 (out+j)[3] = (uint)in[i+0];
11445 uint * restrict out = (uint* restrict)_out;
11449 (out+j)[0] = (uint)in[i+3];
11450 (out+j)[1] = (uint)in[i+2];
11451 (out+j)[2] = (uint)in[i+1];
11452 (out+j)[3] = (uint)in[i+0];
11464 uint * restrict out = (uint* restrict)_out;
11468 (out+j)[0] = (uint)in[i+4];
11469 (out+j)[1] = (uint)in[i+5];
11470 (out+j)[2] = (uint)in[i+0];
11471 (out+j)[3] = (uint)in[i+1];
11472 (out+j)[4] = (uint)in[i+2];
11473 (out+j)[5] = (uint)in[i+3];
11485 uint * restrict out = (uint* restrict)_out;
11491 (out+j)[0] = (uint)in[i+4];
11492 (out+j)[1] = (uint)in[i+5];
11493 (out+j)[2] = (uint)in[i+0];
11494 (out+j)[3] = (uint)in[i+1];
11495 (out+j)[4] = (uint)in[i+2];
11496 (out+j)[5] = (uint)in[i+3];
11499 (out+j)[0] = (uint)in[i+4];
11500 (out+j)[1] = (uint)in[i+6];
11501 (out+j)[2] = (uint)in[i+2];
11502 (out+j)[3] = (uint)in[i-2];
11503 (out+j)[4] = (uint)in[i+0];
11504 (out+j)[5] = (uint)in[i+3];
11517 uint * restrict out = (uint* restrict)_out;
11521 (out+j)[0] = (uint)in[i];
11533 uint * restrict out = (uint* restrict)_out;
11537 (out+j)[0] = (uint)in[i+1];
11538 (out+j)[1] = (uint)in[i];
11550 uint * restrict out = (uint* restrict)_out;
11554 (out+j)[0] = (uint)in[i+1];
11555 (out+j)[1] = (uint)in[i];
11567 uint * restrict out = (uint* restrict)_out;
11574 (out+j+0)[0] = restart_index;
11575 (out+j+0)[1] = restart_index;
11580 (out+j)[0] = (uint)in[start];
11581 (out+j)[1] = (uint)in[end];
11589 (out+j)[0] = (uint)in[start];
11590 (out+j)[1] = (uint)in[end];
11596 (out+j)[0] = (uint)in[i+1];
11597 (out+j)[1] = (uint)in[i];
11600 (out+j)[0] = (uint)in[start];
11601 (out+j)[1] = (uint)in[end];
11612 uint * restrict out = (uint* restrict)_out;
11616 (out+j)[0] = (uint)in[i+2];
11617 (out+j)[1] = (uint)in[i];
11618 (out+j)[2] = (uint)in[i+1];
11630 uint * restrict out = (uint* restrict)_out;
11634 (out+j)[0] = (uint)in[i+2];
11635 (out+j)[1] = (uint)in[i+(i&1)];
11636 (out+j)[2] = (uint)in[i+1-(i&1)];
11648 uint * restrict out = (uint* restrict)_out;
11654 (out+j+0)[0] = restart_index;
11655 (out+j+0)[1] = restart_index;
11656 (out+j+0)[2] = restart_index;
11674 (out+j)[0] = (uint)in[i+2];
11675 (out+j)[1] = (uint)in[start];
11676 (out+j)[2] = (uint)in[i+1];
11688 uint * restrict out = (uint* restrict)_out;
11694 (out+j+0)[0] = restart_index;
11695 (out+j+0)[1] = restart_index;
11696 (out+j+0)[2] = restart_index;
11697 (out+j+3)[0] = restart_index;
11698 (out+j+3)[1] = restart_index;
11699 (out+j+3)[2] = restart_index;
11718 (out+j+0)[0] = (uint)in[i+3];
11719 (out+j+0)[1] = (uint)in[i+0];
11720 (out+j+0)[2] = (uint)in[i+1];
11721 (out+j+3)[0] = (uint)in[i+3];
11722 (out+j+3)[1] = (uint)in[i+1];
11723 (out+j+3)[2] = (uint)in[i+2];
11735 uint * restrict out = (uint* restrict)_out;
11741 (out+j+0)[0] = restart_index;
11742 (out+j+0)[1] = restart_index;
11743 (out+j+0)[2] = restart_index;
11744 (out+j+3)[0] = restart_index;
11745 (out+j+3)[1] = restart_index;
11746 (out+j+3)[2] = restart_index;
11765 (out+j+0)[0] = (uint)in[i+3];
11766 (out+j+0)[1] = (uint)in[i+2];
11767 (out+j+0)[2] = (uint)in[i+0];
11768 (out+j+3)[0] = (uint)in[i+3];
11769 (out+j+3)[1] = (uint)in[i+0];
11770 (out+j+3)[2] = (uint)in[i+1];
11782 uint * restrict out = (uint* restrict)_out;
11788 (out+j+0)[0] = restart_index;
11789 (out+j+0)[1] = restart_index;
11790 (out+j+0)[2] = restart_index;
11808 (out+j)[0] = (uint)in[start];
11809 (out+j)[1] = (uint)in[i+1];
11810 (out+j)[2] = (uint)in[i+2];
11822 uint * restrict out = (uint* restrict)_out;
11826 (out+j)[0] = (uint)in[i+3];
11827 (out+j)[1] = (uint)in[i+2];
11828 (out+j)[2] = (uint)in[i+1];
11829 (out+j)[3] = (uint)in[i+0];
11841 uint * restrict out = (uint* restrict)_out;
11845 (out+j)[0] = (uint)in[i+3];
11846 (out+j)[1] = (uint)in[i+2];
11847 (out+j)[2] = (uint)in[i+1];
11848 (out+j)[3] = (uint)in[i+0];
11860 uint * restrict out = (uint* restrict)_out;
11864 (out+j)[0] = (uint)in[i+4];
11865 (out+j)[1] = (uint)in[i+5];
11866 (out+j)[2] = (uint)in[i+0];
11867 (out+j)[3] = (uint)in[i+1];
11868 (out+j)[4] = (uint)in[i+2];
11869 (out+j)[5] = (uint)in[i+3];
11881 uint * restrict out = (uint* restrict)_out;
11887 (out+j)[0] = (uint)in[i+4];
11888 (out+j)[1] = (uint)in[i+5];
11889 (out+j)[2] = (uint)in[i+0];
11890 (out+j)[3] = (uint)in[i+1];
11891 (out+j)[4] = (uint)in[i+2];
11892 (out+j)[5] = (uint)in[i+3];
11895 (out+j)[0] = (uint)in[i+4];
11896 (out+j)[1] = (uint)in[i+6];
11897 (out+j)[2] = (uint)in[i+2];
11898 (out+j)[3] = (uint)in[i-2];
11899 (out+j)[4] = (uint)in[i+0];
11900 (out+j)[5] = (uint)in[i+3];
11913 uint * restrict out = (uint* restrict)_out;
11917 (out+j)[0] = (uint)in[i];
11929 uint * restrict out = (uint* restrict)_out;
11933 (out+j)[0] = (uint)in[i];
11934 (out+j)[1] = (uint)in[i+1];
11946 uint * restrict out = (uint* restrict)_out;
11950 (out+j)[0] = (uint)in[i];
11951 (out+j)[1] = (uint)in[i+1];
11963 uint * restrict out = (uint* restrict)_out;
11968 (out+j)[0] = (uint)in[i];
11969 (out+j)[1] = (uint)in[i+1];
11972 (out+j)[0] = (uint)in[end];
11973 (out+j)[1] = (uint)in[start];
11984 uint * restrict out = (uint* restrict)_out;
11988 (out+j)[0] = (uint)in[i];
11989 (out+j)[1] = (uint)in[i+1];
11990 (out+j)[2] = (uint)in[i+2];
12002 uint * restrict out = (uint* restrict)_out;
12006 (out+j)[0] = (uint)in[i+(i&1)];
12007 (out+j)[1] = (uint)in[i+1-(i&1)];
12008 (out+j)[2] = (uint)in[i+2];
12020 uint * restrict out = (uint* restrict)_out;
12024 (out+j)[0] = (uint)in[start];
12025 (out+j)[1] = (uint)in[i+1];
12026 (out+j)[2] = (uint)in[i+2];
12038 uint * restrict out = (uint* restrict)_out;
12042 (out+j+0)[0] = (uint)in[i+0];
12043 (out+j+0)[1] = (uint)in[i+1];
12044 (out+j+0)[2] = (uint)in[i+3];
12045 (out+j+3)[0] = (uint)in[i+1];
12046 (out+j+3)[1] = (uint)in[i+2];
12047 (out+j+3)[2] = (uint)in[i+3];
12059 uint * restrict out = (uint* restrict)_out;
12063 (out+j+0)[0] = (uint)in[i+2];
12064 (out+j+0)[1] = (uint)in[i+0];
12065 (out+j+0)[2] = (uint)in[i+3];
12066 (out+j+3)[0] = (uint)in[i+0];
12067 (out+j+3)[1] = (uint)in[i+1];
12068 (out+j+3)[2] = (uint)in[i+3];
12080 uint * restrict out = (uint* restrict)_out;
12084 (out+j)[0] = (uint)in[i+1];
12085 (out+j)[1] = (uint)in[i+2];
12086 (out+j)[2] = (uint)in[start];
12098 uint * restrict out = (uint* restrict)_out;
12102 (out+j)[0] = (uint)in[i+0];
12103 (out+j)[1] = (uint)in[i+1];
12104 (out+j)[2] = (uint)in[i+2];
12105 (out+j)[3] = (uint)in[i+3];
12117 uint * restrict out = (uint* restrict)_out;
12121 (out+j)[0] = (uint)in[i+0];
12122 (out+j)[1] = (uint)in[i+1];
12123 (out+j)[2] = (uint)in[i+2];
12124 (out+j)[3] = (uint)in[i+3];
12136 uint * restrict out = (uint* restrict)_out;
12140 (out+j)[0] = (uint)in[i+0];
12141 (out+j)[1] = (uint)in[i+1];
12142 (out+j)[2] = (uint)in[i+2];
12143 (out+j)[3] = (uint)in[i+3];
12144 (out+j)[4] = (uint)in[i+4];
12145 (out+j)[5] = (uint)in[i+5];
12157 uint * restrict out = (uint* restrict)_out;
12163 (out+j)[0] = (uint)in[i+0];
12164 (out+j)[1] = (uint)in[i+1];
12165 (out+j)[2] = (uint)in[i+2];
12166 (out+j)[3] = (uint)in[i+3];
12167 (out+j)[4] = (uint)in[i+4];
12168 (out+j)[5] = (uint)in[i+5];
12171 (out+j)[0] = (uint)in[i+2];
12172 (out+j)[1] = (uint)in[i-2];
12173 (out+j)[2] = (uint)in[i+0];
12174 (out+j)[3] = (uint)in[i+3];
12175 (out+j)[4] = (uint)in[i+4];
12176 (out+j)[5] = (uint)in[i+6];
12189 uint * restrict out = (uint* restrict)_out;
12193 (out+j)[0] = (uint)in[i];
12205 uint * restrict out = (uint* restrict)_out;
12209 (out+j)[0] = (uint)in[i];
12210 (out+j)[1] = (uint)in[i+1];
12222 uint * restrict out = (uint* restrict)_out;
12226 (out+j)[0] = (uint)in[i];
12227 (out+j)[1] = (uint)in[i+1];
12239 uint * restrict out = (uint* restrict)_out;
12246 (out+j+0)[0] = restart_index;
12247 (out+j+0)[1] = restart_index;
12252 (out+j)[0] = (uint)in[end];
12253 (out+j)[1] = (uint)in[start];
12261 (out+j)[0] = (uint)in[end];
12262 (out+j)[1] = (uint)in[start];
12268 (out+j)[0] = (uint)in[i];
12269 (out+j)[1] = (uint)in[i+1];
12272 (out+j)[0] = (uint)in[end];
12273 (out+j)[1] = (uint)in[start];
12284 uint * restrict out = (uint* restrict)_out;
12288 (out+j)[0] = (uint)in[i];
12289 (out+j)[1] = (uint)in[i+1];
12290 (out+j)[2] = (uint)in[i+2];
12302 uint * restrict out = (uint* restrict)_out;
12306 (out+j)[0] = (uint)in[i+(i&1)];
12307 (out+j)[1] = (uint)in[i+1-(i&1)];
12308 (out+j)[2] = (uint)in[i+2];
12320 uint * restrict out = (uint* restrict)_out;
12326 (out+j+0)[0] = restart_index;
12327 (out+j+0)[1] = restart_index;
12328 (out+j+0)[2] = restart_index;
12346 (out+j)[0] = (uint)in[start];
12347 (out+j)[1] = (uint)in[i+1];
12348 (out+j)[2] = (uint)in[i+2];
12360 uint * restrict out = (uint* restrict)_out;
12366 (out+j+0)[0] = restart_index;
12367 (out+j+0)[1] = restart_index;
12368 (out+j+0)[2] = restart_index;
12369 (out+j+3)[0] = restart_index;
12370 (out+j+3)[1] = restart_index;
12371 (out+j+3)[2] = restart_index;
12390 (out+j+0)[0] = (uint)in[i+0];
12391 (out+j+0)[1] = (uint)in[i+1];
12392 (out+j+0)[2] = (uint)in[i+3];
12393 (out+j+3)[0] = (uint)in[i+1];
12394 (out+j+3)[1] = (uint)in[i+2];
12395 (out+j+3)[2] = (uint)in[i+3];
12407 uint * restrict out = (uint* restrict)_out;
12413 (out+j+0)[0] = restart_index;
12414 (out+j+0)[1] = restart_index;
12415 (out+j+0)[2] = restart_index;
12416 (out+j+3)[0] = restart_index;
12417 (out+j+3)[1] = restart_index;
12418 (out+j+3)[2] = restart_index;
12437 (out+j+0)[0] = (uint)in[i+2];
12438 (out+j+0)[1] = (uint)in[i+0];
12439 (out+j+0)[2] = (uint)in[i+3];
12440 (out+j+3)[0] = (uint)in[i+0];
12441 (out+j+3)[1] = (uint)in[i+1];
12442 (out+j+3)[2] = (uint)in[i+3];
12454 uint * restrict out = (uint* restrict)_out;
12460 (out+j+0)[0] = restart_index;
12461 (out+j+0)[1] = restart_index;
12462 (out+j+0)[2] = restart_index;
12480 (out+j)[0] = (uint)in[i+1];
12481 (out+j)[1] = (uint)in[i+2];
12482 (out+j)[2] = (uint)in[start];
12494 uint * restrict out = (uint* restrict)_out;
12498 (out+j)[0] = (uint)in[i+0];
12499 (out+j)[1] = (uint)in[i+1];
12500 (out+j)[2] = (uint)in[i+2];
12501 (out+j)[3] = (uint)in[i+3];
12513 uint * restrict out = (uint* restrict)_out;
12517 (out+j)[0] = (uint)in[i+0];
12518 (out+j)[1] = (uint)in[i+1];
12519 (out+j)[2] = (uint)in[i+2];
12520 (out+j)[3] = (uint)in[i+3];
12532 uint * restrict out = (uint* restrict)_out;
12536 (out+j)[0] = (uint)in[i+0];
12537 (out+j)[1] = (uint)in[i+1];
12538 (out+j)[2] = (uint)in[i+2];
12539 (out+j)[3] = (uint)in[i+3];
12540 (out+j)[4] = (uint)in[i+4];
12541 (out+j)[5] = (uint)in[i+5];
12553 uint * restrict out = (uint* restrict)_out;
12559 (out+j)[0] = (uint)in[i+0];
12560 (out+j)[1] = (uint)in[i+1];
12561 (out+j)[2] = (uint)in[i+2];
12562 (out+j)[3] = (uint)in[i+3];
12563 (out+j)[4] = (uint)in[i+4];
12564 (out+j)[5] = (uint)in[i+5];
12567 (out+j)[0] = (uint)in[i+2];
12568 (out+j)[1] = (uint)in[i-2];
12569 (out+j)[2] = (uint)in[i+0];
12570 (out+j)[3] = (uint)in[i+3];
12571 (out+j)[4] = (uint)in[i+4];
12572 (out+j)[5] = (uint)in[i+6];
12585 ushort * restrict out = (ushort* restrict)_out;
12589 (out+j)[0] = (ushort)in[i];
12601 ushort * restrict out = (ushort* restrict)_out;
12605 (out+j)[0] = (ushort)in[i];
12606 (out+j)[1] = (ushort)in[i+1];
12618 ushort * restrict out = (ushort* restrict)_out;
12622 (out+j)[0] = (ushort)in[i];
12623 (out+j)[1] = (ushort)in[i+1];
12635 ushort * restrict out = (ushort* restrict)_out;
12640 (out+j)[0] = (ushort)in[i];
12641 (out+j)[1] = (ushort)in[i+1];
12644 (out+j)[0] = (ushort)in[end];
12645 (out+j)[1] = (ushort)in[start];
12656 ushort * restrict out = (ushort* restrict)_out;
12660 (out+j)[0] = (ushort)in[i];
12661 (out+j)[1] = (ushort)in[i+1];
12662 (out+j)[2] = (ushort)in[i+2];
12674 ushort * restrict out = (ushort* restrict)_out;
12678 (out+j)[0] = (ushort)in[i];
12679 (out+j)[1] = (ushort)in[i+1+(i&1)];
12680 (out+j)[2] = (ushort)in[i+2-(i&1)];
12692 ushort * restrict out = (ushort* restrict)_out;
12696 (out+j)[0] = (ushort)in[i+1];
12697 (out+j)[1] = (ushort)in[i+2];
12698 (out+j)[2] = (ushort)in[start];
12710 ushort * restrict out = (ushort* restrict)_out;
12714 (out+j+0)[0] = (ushort)in[i+0];
12715 (out+j+0)[1] = (ushort)in[i+1];
12716 (out+j+0)[2] = (ushort)in[i+2];
12717 (out+j+3)[0] = (ushort)in[i+0];
12718 (out+j+3)[1] = (ushort)in[i+2];
12719 (out+j+3)[2] = (ushort)in[i+3];
12731 ushort * restrict out = (ushort* restrict)_out;
12735 (out+j+0)[0] = (ushort)in[i+0];
12736 (out+j+0)[1] = (ushort)in[i+1];
12737 (out+j+0)[2] = (ushort)in[i+3];
12738 (out+j+3)[0] = (ushort)in[i+0];
12739 (out+j+3)[1] = (ushort)in[i+3];
12740 (out+j+3)[2] = (ushort)in[i+2];
12752 ushort * restrict out = (ushort* restrict)_out;
12756 (out+j)[0] = (ushort)in[start];
12757 (out+j)[1] = (ushort)in[i+1];
12758 (out+j)[2] = (ushort)in[i+2];
12770 ushort * restrict out = (ushort* restrict)_out;
12774 (out+j)[0] = (ushort)in[i+0];
12775 (out+j)[1] = (ushort)in[i+1];
12776 (out+j)[2] = (ushort)in[i+2];
12777 (out+j)[3] = (ushort)in[i+3];
12789 ushort * restrict out = (ushort* restrict)_out;
12793 (out+j)[0] = (ushort)in[i+0];
12794 (out+j)[1] = (ushort)in[i+1];
12795 (out+j)[2] = (ushort)in[i+2];
12796 (out+j)[3] = (ushort)in[i+3];
12808 ushort * restrict out = (ushort* restrict)_out;
12812 (out+j)[0] = (ushort)in[i+0];
12813 (out+j)[1] = (ushort)in[i+1];
12814 (out+j)[2] = (ushort)in[i+2];
12815 (out+j)[3] = (ushort)in[i+3];
12816 (out+j)[4] = (ushort)in[i+4];
12817 (out+j)[5] = (ushort)in[i+5];
12829 ushort * restrict out = (ushort* restrict)_out;
12835 (out+j)[0] = (ushort)in[i+0];
12836 (out+j)[1] = (ushort)in[i+1];
12837 (out+j)[2] = (ushort)in[i+2];
12838 (out+j)[3] = (ushort)in[i+3];
12839 (out+j)[4] = (ushort)in[i+4];
12840 (out+j)[5] = (ushort)in[i+5];
12843 (out+j)[0] = (ushort)in[i+2];
12844 (out+j)[1] = (ushort)in[i-2];
12845 (out+j)[2] = (ushort)in[i+0];
12846 (out+j)[3] = (ushort)in[i+3];
12847 (out+j)[4] = (ushort)in[i+4];
12848 (out+j)[5] = (ushort)in[i+6];
12861 ushort * restrict out = (ushort* restrict)_out;
12865 (out+j)[0] = (ushort)in[i];
12877 ushort * restrict out = (ushort* restrict)_out;
12881 (out+j)[0] = (ushort)in[i];
12882 (out+j)[1] = (ushort)in[i+1];
12894 ushort * restrict out = (ushort* restrict)_out;
12898 (out+j)[0] = (ushort)in[i];
12899 (out+j)[1] = (ushort)in[i+1];
12911 ushort * restrict out = (ushort* restrict)_out;
12918 (out+j+0)[0] = restart_index;
12919 (out+j+0)[1] = restart_index;
12924 (out+j)[0] = (ushort)in[end];
12925 (out+j)[1] = (ushort)in[start];
12933 (out+j)[0] = (ushort)in[end];
12934 (out+j)[1] = (ushort)in[start];
12940 (out+j)[0] = (ushort)in[i];
12941 (out+j)[1] = (ushort)in[i+1];
12944 (out+j)[0] = (ushort)in[end];
12945 (out+j)[1] = (ushort)in[start];
12956 ushort * restrict out = (ushort* restrict)_out;
12960 (out+j)[0] = (ushort)in[i];
12961 (out+j)[1] = (ushort)in[i+1];
12962 (out+j)[2] = (ushort)in[i+2];
12974 ushort * restrict out = (ushort* restrict)_out;
12978 (out+j)[0] = (ushort)in[i];
12979 (out+j)[1] = (ushort)in[i+1+(i&1)];
12980 (out+j)[2] = (ushort)in[i+2-(i&1)];
12992 ushort * restrict out = (ushort* restrict)_out;
12998 (out+j+0)[0] = restart_index;
12999 (out+j+0)[1] = restart_index;
13000 (out+j+0)[2] = restart_index;
13018 (out+j)[0] = (ushort)in[i+1];
13019 (out+j)[1] = (ushort)in[i+2];
13020 (out+j)[2] = (ushort)in[start];
13032 ushort * restrict out = (ushort* restrict)_out;
13038 (out+j+0)[0] = restart_index;
13039 (out+j+0)[1] = restart_index;
13040 (out+j+0)[2] = restart_index;
13041 (out+j+3)[0] = restart_index;
13042 (out+j+3)[1] = restart_index;
13043 (out+j+3)[2] = restart_index;
13062 (out+j+0)[0] = (ushort)in[i+0];
13063 (out+j+0)[1] = (ushort)in[i+1];
13064 (out+j+0)[2] = (ushort)in[i+2];
13065 (out+j+3)[0] = (ushort)in[i+0];
13066 (out+j+3)[1] = (ushort)in[i+2];
13067 (out+j+3)[2] = (ushort)in[i+3];
13079 ushort * restrict out = (ushort* restrict)_out;
13085 (out+j+0)[0] = restart_index;
13086 (out+j+0)[1] = restart_index;
13087 (out+j+0)[2] = restart_index;
13088 (out+j+3)[0] = restart_index;
13089 (out+j+3)[1] = restart_index;
13090 (out+j+3)[2] = restart_index;
13109 (out+j+0)[0] = (ushort)in[i+0];
13110 (out+j+0)[1] = (ushort)in[i+1];
13111 (out+j+0)[2] = (ushort)in[i+3];
13112 (out+j+3)[0] = (ushort)in[i+0];
13113 (out+j+3)[1] = (ushort)in[i+3];
13114 (out+j+3)[2] = (ushort)in[i+2];
13126 ushort * restrict out = (ushort* restrict)_out;
13132 (out+j+0)[0] = restart_index;
13133 (out+j+0)[1] = restart_index;
13134 (out+j+0)[2] = restart_index;
13152 (out+j)[0] = (ushort)in[start];
13153 (out+j)[1] = (ushort)in[i+1];
13154 (out+j)[2] = (ushort)in[i+2];
13166 ushort * restrict out = (ushort* restrict)_out;
13170 (out+j)[0] = (ushort)in[i+0];
13171 (out+j)[1] = (ushort)in[i+1];
13172 (out+j)[2] = (ushort)in[i+2];
13173 (out+j)[3] = (ushort)in[i+3];
13185 ushort * restrict out = (ushort* restrict)_out;
13189 (out+j)[0] = (ushort)in[i+0];
13190 (out+j)[1] = (ushort)in[i+1];
13191 (out+j)[2] = (ushort)in[i+2];
13192 (out+j)[3] = (ushort)in[i+3];
13204 ushort * restrict out = (ushort* restrict)_out;
13208 (out+j)[0] = (ushort)in[i+0];
13209 (out+j)[1] = (ushort)in[i+1];
13210 (out+j)[2] = (ushort)in[i+2];
13211 (out+j)[3] = (ushort)in[i+3];
13212 (out+j)[4] = (ushort)in[i+4];
13213 (out+j)[5] = (ushort)in[i+5];
13225 ushort * restrict out = (ushort* restrict)_out;
13231 (out+j)[0] = (ushort)in[i+0];
13232 (out+j)[1] = (ushort)in[i+1];
13233 (out+j)[2] = (ushort)in[i+2];
13234 (out+j)[3] = (ushort)in[i+3];
13235 (out+j)[4] = (ushort)in[i+4];
13236 (out+j)[5] = (ushort)in[i+5];
13239 (out+j)[0] = (ushort)in[i+2];
13240 (out+j)[1] = (ushort)in[i-2];
13241 (out+j)[2] = (ushort)in[i+0];
13242 (out+j)[3] = (ushort)in[i+3];
13243 (out+j)[4] = (ushort)in[i+4];
13244 (out+j)[5] = (ushort)in[i+6];
13257 ushort * restrict out = (ushort* restrict)_out;
13261 (out+j)[0] = (ushort)in[i];
13273 ushort * restrict out = (ushort* restrict)_out;
13277 (out+j)[0] = (ushort)in[i+1];
13278 (out+j)[1] = (ushort)in[i];
13290 ushort * restrict out = (ushort* restrict)_out;
13294 (out+j)[0] = (ushort)in[i+1];
13295 (out+j)[1] = (ushort)in[i];
13307 ushort * restrict out = (ushort* restrict)_out;
13312 (out+j)[0] = (ushort)in[i+1];
13313 (out+j)[1] = (ushort)in[i];
13316 (out+j)[0] = (ushort)in[start];
13317 (out+j)[1] = (ushort)in[end];
13328 ushort * restrict out = (ushort* restrict)_out;
13332 (out+j)[0] = (ushort)in[i+1];
13333 (out+j)[1] = (ushort)in[i+2];
13334 (out+j)[2] = (ushort)in[i];
13346 ushort * restrict out = (ushort* restrict)_out;
13350 (out+j)[0] = (ushort)in[i+1+(i&1)];
13351 (out+j)[1] = (ushort)in[i+2-(i&1)];
13352 (out+j)[2] = (ushort)in[i];
13364 ushort * restrict out = (ushort* restrict)_out;
13368 (out+j)[0] = (ushort)in[i+2];
13369 (out+j)[1] = (ushort)in[start];
13370 (out+j)[2] = (ushort)in[i+1];
13382 ushort * restrict out = (ushort* restrict)_out;
13386 (out+j+0)[0] = (ushort)in[i+1];
13387 (out+j+0)[1] = (ushort)in[i+2];
13388 (out+j+0)[2] = (ushort)in[i+0];
13389 (out+j+3)[0] = (ushort)in[i+2];
13390 (out+j+3)[1] = (ushort)in[i+3];
13391 (out+j+3)[2] = (ushort)in[i+0];
13403 ushort * restrict out = (ushort* restrict)_out;
13407 (out+j+0)[0] = (ushort)in[i+1];
13408 (out+j+0)[1] = (ushort)in[i+3];
13409 (out+j+0)[2] = (ushort)in[i+0];
13410 (out+j+3)[0] = (ushort)in[i+3];
13411 (out+j+3)[1] = (ushort)in[i+2];
13412 (out+j+3)[2] = (ushort)in[i+0];
13424 ushort * restrict out = (ushort* restrict)_out;
13428 (out+j)[0] = (ushort)in[i+1];
13429 (out+j)[1] = (ushort)in[i+2];
13430 (out+j)[2] = (ushort)in[start];
13442 ushort * restrict out = (ushort* restrict)_out;
13446 (out+j)[0] = (ushort)in[i+3];
13447 (out+j)[1] = (ushort)in[i+2];
13448 (out+j)[2] = (ushort)in[i+1];
13449 (out+j)[3] = (ushort)in[i+0];
13461 ushort * restrict out = (ushort* restrict)_out;
13465 (out+j)[0] = (ushort)in[i+3];
13466 (out+j)[1] = (ushort)in[i+2];
13467 (out+j)[2] = (ushort)in[i+1];
13468 (out+j)[3] = (ushort)in[i+0];
13480 ushort * restrict out = (ushort* restrict)_out;
13484 (out+j)[0] = (ushort)in[i+4];
13485 (out+j)[1] = (ushort)in[i+5];
13486 (out+j)[2] = (ushort)in[i+0];
13487 (out+j)[3] = (ushort)in[i+1];
13488 (out+j)[4] = (ushort)in[i+2];
13489 (out+j)[5] = (ushort)in[i+3];
13501 ushort * restrict out = (ushort* restrict)_out;
13507 (out+j)[0] = (ushort)in[i+4];
13508 (out+j)[1] = (ushort)in[i+5];
13509 (out+j)[2] = (ushort)in[i+0];
13510 (out+j)[3] = (ushort)in[i+1];
13511 (out+j)[4] = (ushort)in[i+2];
13512 (out+j)[5] = (ushort)in[i+3];
13515 (out+j)[0] = (ushort)in[i+4];
13516 (out+j)[1] = (ushort)in[i+6];
13517 (out+j)[2] = (ushort)in[i+2];
13518 (out+j)[3] = (ushort)in[i-2];
13519 (out+j)[4] = (ushort)in[i+0];
13520 (out+j)[5] = (ushort)in[i+3];
13533 ushort * restrict out = (ushort* restrict)_out;
13537 (out+j)[0] = (ushort)in[i];
13549 ushort * restrict out = (ushort* restrict)_out;
13553 (out+j)[0] = (ushort)in[i+1];
13554 (out+j)[1] = (ushort)in[i];
13566 ushort * restrict out = (ushort* restrict)_out;
13570 (out+j)[0] = (ushort)in[i+1];
13571 (out+j)[1] = (ushort)in[i];
13583 ushort * restrict out = (ushort* restrict)_out;
13590 (out+j+0)[0] = restart_index;
13591 (out+j+0)[1] = restart_index;
13596 (out+j)[0] = (ushort)in[start];
13597 (out+j)[1] = (ushort)in[end];
13605 (out+j)[0] = (ushort)in[start];
13606 (out+j)[1] = (ushort)in[end];
13612 (out+j)[0] = (ushort)in[i+1];
13613 (out+j)[1] = (ushort)in[i];
13616 (out+j)[0] = (ushort)in[start];
13617 (out+j)[1] = (ushort)in[end];
13628 ushort * restrict out = (ushort* restrict)_out;
13632 (out+j)[0] = (ushort)in[i+1];
13633 (out+j)[1] = (ushort)in[i+2];
13634 (out+j)[2] = (ushort)in[i];
13646 ushort * restrict out = (ushort* restrict)_out;
13650 (out+j)[0] = (ushort)in[i+1+(i&1)];
13651 (out+j)[1] = (ushort)in[i+2-(i&1)];
13652 (out+j)[2] = (ushort)in[i];
13664 ushort * restrict out = (ushort* restrict)_out;
13670 (out+j+0)[0] = restart_index;
13671 (out+j+0)[1] = restart_index;
13672 (out+j+0)[2] = restart_index;
13690 (out+j)[0] = (ushort)in[i+2];
13691 (out+j)[1] = (ushort)in[start];
13692 (out+j)[2] = (ushort)in[i+1];
13704 ushort * restrict out = (ushort* restrict)_out;
13710 (out+j+0)[0] = restart_index;
13711 (out+j+0)[1] = restart_index;
13712 (out+j+0)[2] = restart_index;
13713 (out+j+3)[0] = restart_index;
13714 (out+j+3)[1] = restart_index;
13715 (out+j+3)[2] = restart_index;
13734 (out+j+0)[0] = (ushort)in[i+1];
13735 (out+j+0)[1] = (ushort)in[i+2];
13736 (out+j+0)[2] = (ushort)in[i+0];
13737 (out+j+3)[0] = (ushort)in[i+2];
13738 (out+j+3)[1] = (ushort)in[i+3];
13739 (out+j+3)[2] = (ushort)in[i+0];
13751 ushort * restrict out = (ushort* restrict)_out;
13757 (out+j+0)[0] = restart_index;
13758 (out+j+0)[1] = restart_index;
13759 (out+j+0)[2] = restart_index;
13760 (out+j+3)[0] = restart_index;
13761 (out+j+3)[1] = restart_index;
13762 (out+j+3)[2] = restart_index;
13781 (out+j+0)[0] = (ushort)in[i+1];
13782 (out+j+0)[1] = (ushort)in[i+3];
13783 (out+j+0)[2] = (ushort)in[i+0];
13784 (out+j+3)[0] = (ushort)in[i+3];
13785 (out+j+3)[1] = (ushort)in[i+2];
13786 (out+j+3)[2] = (ushort)in[i+0];
13798 ushort * restrict out = (ushort* restrict)_out;
13804 (out+j+0)[0] = restart_index;
13805 (out+j+0)[1] = restart_index;
13806 (out+j+0)[2] = restart_index;
13824 (out+j)[0] = (ushort)in[i+1];
13825 (out+j)[1] = (ushort)in[i+2];
13826 (out+j)[2] = (ushort)in[start];
13838 ushort * restrict out = (ushort* restrict)_out;
13842 (out+j)[0] = (ushort)in[i+3];
13843 (out+j)[1] = (ushort)in[i+2];
13844 (out+j)[2] = (ushort)in[i+1];
13845 (out+j)[3] = (ushort)in[i+0];
13857 ushort * restrict out = (ushort* restrict)_out;
13861 (out+j)[0] = (ushort)in[i+3];
13862 (out+j)[1] = (ushort)in[i+2];
13863 (out+j)[2] = (ushort)in[i+1];
13864 (out+j)[3] = (ushort)in[i+0];
13876 ushort * restrict out = (ushort* restrict)_out;
13880 (out+j)[0] = (ushort)in[i+4];
13881 (out+j)[1] = (ushort)in[i+5];
13882 (out+j)[2] = (ushort)in[i+0];
13883 (out+j)[3] = (ushort)in[i+1];
13884 (out+j)[4] = (ushort)in[i+2];
13885 (out+j)[5] = (ushort)in[i+3];
13897 ushort * restrict out = (ushort* restrict)_out;
13903 (out+j)[0] = (ushort)in[i+4];
13904 (out+j)[1] = (ushort)in[i+5];
13905 (out+j)[2] = (ushort)in[i+0];
13906 (out+j)[3] = (ushort)in[i+1];
13907 (out+j)[4] = (ushort)in[i+2];
13908 (out+j)[5] = (ushort)in[i+3];
13911 (out+j)[0] = (ushort)in[i+4];
13912 (out+j)[1] = (ushort)in[i+6];
13913 (out+j)[2] = (ushort)in[i+2];
13914 (out+j)[3] = (ushort)in[i-2];
13915 (out+j)[4] = (ushort)in[i+0];
13916 (out+j)[5] = (ushort)in[i+3];
13929 ushort * restrict out = (ushort* restrict)_out;
13933 (out+j)[0] = (ushort)in[i];
13945 ushort * restrict out = (ushort* restrict)_out;
13949 (out+j)[0] = (ushort)in[i+1];
13950 (out+j)[1] = (ushort)in[i];
13962 ushort * restrict out = (ushort* restrict)_out;
13966 (out+j)[0] = (ushort)in[i+1];
13967 (out+j)[1] = (ushort)in[i];
13979 ushort * restrict out = (ushort* restrict)_out;
13984 (out+j)[0] = (ushort)in[i+1];
13985 (out+j)[1] = (ushort)in[i];
13988 (out+j)[0] = (ushort)in[start];
13989 (out+j)[1] = (ushort)in[end];
14000 ushort * restrict out = (ushort* restrict)_out;
14004 (out+j)[0] = (ushort)in[i+2];
14005 (out+j)[1] = (ushort)in[i];
14006 (out+j)[2] = (ushort)in[i+1];
14018 ushort * restrict out = (ushort* restrict)_out;
14022 (out+j)[0] = (ushort)in[i+2];
14023 (out+j)[1] = (ushort)in[i+(i&1)];
14024 (out+j)[2] = (ushort)in[i+1-(i&1)];
14036 ushort * restrict out = (ushort* restrict)_out;
14040 (out+j)[0] = (ushort)in[i+2];
14041 (out+j)[1] = (ushort)in[start];
14042 (out+j)[2] = (ushort)in[i+1];
14054 ushort * restrict out = (ushort* restrict)_out;
14058 (out+j+0)[0] = (ushort)in[i+3];
14059 (out+j+0)[1] = (ushort)in[i+0];
14060 (out+j+0)[2] = (ushort)in[i+1];
14061 (out+j+3)[0] = (ushort)in[i+3];
14062 (out+j+3)[1] = (ushort)in[i+1];
14063 (out+j+3)[2] = (ushort)in[i+2];
14075 ushort * restrict out = (ushort* restrict)_out;
14079 (out+j+0)[0] = (ushort)in[i+3];
14080 (out+j+0)[1] = (ushort)in[i+2];
14081 (out+j+0)[2] = (ushort)in[i+0];
14082 (out+j+3)[0] = (ushort)in[i+3];
14083 (out+j+3)[1] = (ushort)in[i+0];
14084 (out+j+3)[2] = (ushort)in[i+1];
14096 ushort * restrict out = (ushort* restrict)_out;
14100 (out+j)[0] = (ushort)in[start];
14101 (out+j)[1] = (ushort)in[i+1];
14102 (out+j)[2] = (ushort)in[i+2];
14114 ushort * restrict out = (ushort* restrict)_out;
14118 (out+j)[0] = (ushort)in[i+3];
14119 (out+j)[1] = (ushort)in[i+2];
14120 (out+j)[2] = (ushort)in[i+1];
14121 (out+j)[3] = (ushort)in[i+0];
14133 ushort * restrict out = (ushort* restrict)_out;
14137 (out+j)[0] = (ushort)in[i+3];
14138 (out+j)[1] = (ushort)in[i+2];
14139 (out+j)[2] = (ushort)in[i+1];
14140 (out+j)[3] = (ushort)in[i+0];
14152 ushort * restrict out = (ushort* restrict)_out;
14156 (out+j)[0] = (ushort)in[i+4];
14157 (out+j)[1] = (ushort)in[i+5];
14158 (out+j)[2] = (ushort)in[i+0];
14159 (out+j)[3] = (ushort)in[i+1];
14160 (out+j)[4] = (ushort)in[i+2];
14161 (out+j)[5] = (ushort)in[i+3];
14173 ushort * restrict out = (ushort* restrict)_out;
14179 (out+j)[0] = (ushort)in[i+4];
14180 (out+j)[1] = (ushort)in[i+5];
14181 (out+j)[2] = (ushort)in[i+0];
14182 (out+j)[3] = (ushort)in[i+1];
14183 (out+j)[4] = (ushort)in[i+2];
14184 (out+j)[5] = (ushort)in[i+3];
14187 (out+j)[0] = (ushort)in[i+4];
14188 (out+j)[1] = (ushort)in[i+6];
14189 (out+j)[2] = (ushort)in[i+2];
14190 (out+j)[3] = (ushort)in[i-2];
14191 (out+j)[4] = (ushort)in[i+0];
14192 (out+j)[5] = (ushort)in[i+3];
14205 ushort * restrict out = (ushort* restrict)_out;
14209 (out+j)[0] = (ushort)in[i];
14221 ushort * restrict out = (ushort* restrict)_out;
14225 (out+j)[0] = (ushort)in[i+1];
14226 (out+j)[1] = (ushort)in[i];
14238 ushort * restrict out = (ushort* restrict)_out;
14242 (out+j)[0] = (ushort)in[i+1];
14243 (out+j)[1] = (ushort)in[i];
14255 ushort * restrict out = (ushort* restrict)_out;
14262 (out+j+0)[0] = restart_index;
14263 (out+j+0)[1] = restart_index;
14268 (out+j)[0] = (ushort)in[start];
14269 (out+j)[1] = (ushort)in[end];
14277 (out+j)[0] = (ushort)in[start];
14278 (out+j)[1] = (ushort)in[end];
14284 (out+j)[0] = (ushort)in[i+1];
14285 (out+j)[1] = (ushort)in[i];
14288 (out+j)[0] = (ushort)in[start];
14289 (out+j)[1] = (ushort)in[end];
14300 ushort * restrict out = (ushort* restrict)_out;
14304 (out+j)[0] = (ushort)in[i+2];
14305 (out+j)[1] = (ushort)in[i];
14306 (out+j)[2] = (ushort)in[i+1];
14318 ushort * restrict out = (ushort* restrict)_out;
14322 (out+j)[0] = (ushort)in[i+2];
14323 (out+j)[1] = (ushort)in[i+(i&1)];
14324 (out+j)[2] = (ushort)in[i+1-(i&1)];
14336 ushort * restrict out = (ushort* restrict)_out;
14342 (out+j+0)[0] = restart_index;
14343 (out+j+0)[1] = restart_index;
14344 (out+j+0)[2] = restart_index;
14362 (out+j)[0] = (ushort)in[i+2];
14363 (out+j)[1] = (ushort)in[start];
14364 (out+j)[2] = (ushort)in[i+1];
14376 ushort * restrict out = (ushort* restrict)_out;
14382 (out+j+0)[0] = restart_index;
14383 (out+j+0)[1] = restart_index;
14384 (out+j+0)[2] = restart_index;
14385 (out+j+3)[0] = restart_index;
14386 (out+j+3)[1] = restart_index;
14387 (out+j+3)[2] = restart_index;
14406 (out+j+0)[0] = (ushort)in[i+3];
14407 (out+j+0)[1] = (ushort)in[i+0];
14408 (out+j+0)[2] = (ushort)in[i+1];
14409 (out+j+3)[0] = (ushort)in[i+3];
14410 (out+j+3)[1] = (ushort)in[i+1];
14411 (out+j+3)[2] = (ushort)in[i+2];
14423 ushort * restrict out = (ushort* restrict)_out;
14429 (out+j+0)[0] = restart_index;
14430 (out+j+0)[1] = restart_index;
14431 (out+j+0)[2] = restart_index;
14432 (out+j+3)[0] = restart_index;
14433 (out+j+3)[1] = restart_index;
14434 (out+j+3)[2] = restart_index;
14453 (out+j+0)[0] = (ushort)in[i+3];
14454 (out+j+0)[1] = (ushort)in[i+2];
14455 (out+j+0)[2] = (ushort)in[i+0];
14456 (out+j+3)[0] = (ushort)in[i+3];
14457 (out+j+3)[1] = (ushort)in[i+0];
14458 (out+j+3)[2] = (ushort)in[i+1];
14470 ushort * restrict out = (ushort* restrict)_out;
14476 (out+j+0)[0] = restart_index;
14477 (out+j+0)[1] = restart_index;
14478 (out+j+0)[2] = restart_index;
14496 (out+j)[0] = (ushort)in[start];
14497 (out+j)[1] = (ushort)in[i+1];
14498 (out+j)[2] = (ushort)in[i+2];
14510 ushort * restrict out = (ushort* restrict)_out;
14514 (out+j)[0] = (ushort)in[i+3];
14515 (out+j)[1] = (ushort)in[i+2];
14516 (out+j)[2] = (ushort)in[i+1];
14517 (out+j)[3] = (ushort)in[i+0];
14529 ushort * restrict out = (ushort* restrict)_out;
14533 (out+j)[0] = (ushort)in[i+3];
14534 (out+j)[1] = (ushort)in[i+2];
14535 (out+j)[2] = (ushort)in[i+1];
14536 (out+j)[3] = (ushort)in[i+0];
14548 ushort * restrict out = (ushort* restrict)_out;
14552 (out+j)[0] = (ushort)in[i+4];
14553 (out+j)[1] = (ushort)in[i+5];
14554 (out+j)[2] = (ushort)in[i+0];
14555 (out+j)[3] = (ushort)in[i+1];
14556 (out+j)[4] = (ushort)in[i+2];
14557 (out+j)[5] = (ushort)in[i+3];
14569 ushort * restrict out = (ushort* restrict)_out;
14575 (out+j)[0] = (ushort)in[i+4];
14576 (out+j)[1] = (ushort)in[i+5];
14577 (out+j)[2] = (ushort)in[i+0];
14578 (out+j)[3] = (ushort)in[i+1];
14579 (out+j)[4] = (ushort)in[i+2];
14580 (out+j)[5] = (ushort)in[i+3];
14583 (out+j)[0] = (ushort)in[i+4];
14584 (out+j)[1] = (ushort)in[i+6];
14585 (out+j)[2] = (ushort)in[i+2];
14586 (out+j)[3] = (ushort)in[i-2];
14587 (out+j)[4] = (ushort)in[i+0];
14588 (out+j)[5] = (ushort)in[i+3];
14601 ushort * restrict out = (ushort* restrict)_out;
14605 (out+j)[0] = (ushort)in[i];
14617 ushort * restrict out = (ushort* restrict)_out;
14621 (out+j)[0] = (ushort)in[i];
14622 (out+j)[1] = (ushort)in[i+1];
14634 ushort * restrict out = (ushort* restrict)_out;
14638 (out+j)[0] = (ushort)in[i];
14639 (out+j)[1] = (ushort)in[i+1];
14651 ushort * restrict out = (ushort* restrict)_out;
14656 (out+j)[0] = (ushort)in[i];
14657 (out+j)[1] = (ushort)in[i+1];
14660 (out+j)[0] = (ushort)in[end];
14661 (out+j)[1] = (ushort)in[start];
14672 ushort * restrict out = (ushort* restrict)_out;
14676 (out+j)[0] = (ushort)in[i];
14677 (out+j)[1] = (ushort)in[i+1];
14678 (out+j)[2] = (ushort)in[i+2];
14690 ushort * restrict out = (ushort* restrict)_out;
14694 (out+j)[0] = (ushort)in[i+(i&1)];
14695 (out+j)[1] = (ushort)in[i+1-(i&1)];
14696 (out+j)[2] = (ushort)in[i+2];
14708 ushort * restrict out = (ushort* restrict)_out;
14712 (out+j)[0] = (ushort)in[start];
14713 (out+j)[1] = (ushort)in[i+1];
14714 (out+j)[2] = (ushort)in[i+2];
14726 ushort * restrict out = (ushort* restrict)_out;
14730 (out+j+0)[0] = (ushort)in[i+0];
14731 (out+j+0)[1] = (ushort)in[i+1];
14732 (out+j+0)[2] = (ushort)in[i+3];
14733 (out+j+3)[0] = (ushort)in[i+1];
14734 (out+j+3)[1] = (ushort)in[i+2];
14735 (out+j+3)[2] = (ushort)in[i+3];
14747 ushort * restrict out = (ushort* restrict)_out;
14751 (out+j+0)[0] = (ushort)in[i+2];
14752 (out+j+0)[1] = (ushort)in[i+0];
14753 (out+j+0)[2] = (ushort)in[i+3];
14754 (out+j+3)[0] = (ushort)in[i+0];
14755 (out+j+3)[1] = (ushort)in[i+1];
14756 (out+j+3)[2] = (ushort)in[i+3];
14768 ushort * restrict out = (ushort* restrict)_out;
14772 (out+j)[0] = (ushort)in[i+1];
14773 (out+j)[1] = (ushort)in[i+2];
14774 (out+j)[2] = (ushort)in[start];
14786 ushort * restrict out = (ushort* restrict)_out;
14790 (out+j)[0] = (ushort)in[i+0];
14791 (out+j)[1] = (ushort)in[i+1];
14792 (out+j)[2] = (ushort)in[i+2];
14793 (out+j)[3] = (ushort)in[i+3];
14805 ushort * restrict out = (ushort* restrict)_out;
14809 (out+j)[0] = (ushort)in[i+0];
14810 (out+j)[1] = (ushort)in[i+1];
14811 (out+j)[2] = (ushort)in[i+2];
14812 (out+j)[3] = (ushort)in[i+3];
14824 ushort * restrict out = (ushort* restrict)_out;
14828 (out+j)[0] = (ushort)in[i+0];
14829 (out+j)[1] = (ushort)in[i+1];
14830 (out+j)[2] = (ushort)in[i+2];
14831 (out+j)[3] = (ushort)in[i+3];
14832 (out+j)[4] = (ushort)in[i+4];
14833 (out+j)[5] = (ushort)in[i+5];
14845 ushort * restrict out = (ushort* restrict)_out;
14851 (out+j)[0] = (ushort)in[i+0];
14852 (out+j)[1] = (ushort)in[i+1];
14853 (out+j)[2] = (ushort)in[i+2];
14854 (out+j)[3] = (ushort)in[i+3];
14855 (out+j)[4] = (ushort)in[i+4];
14856 (out+j)[5] = (ushort)in[i+5];
14859 (out+j)[0] = (ushort)in[i+2];
14860 (out+j)[1] = (ushort)in[i-2];
14861 (out+j)[2] = (ushort)in[i+0];
14862 (out+j)[3] = (ushort)in[i+3];
14863 (out+j)[4] = (ushort)in[i+4];
14864 (out+j)[5] = (ushort)in[i+6];
14877 ushort * restrict out = (ushort* restrict)_out;
14881 (out+j)[0] = (ushort)in[i];
14893 ushort * restrict out = (ushort* restrict)_out;
14897 (out+j)[0] = (ushort)in[i];
14898 (out+j)[1] = (ushort)in[i+1];
14910 ushort * restrict out = (ushort* restrict)_out;
14914 (out+j)[0] = (ushort)in[i];
14915 (out+j)[1] = (ushort)in[i+1];
14927 ushort * restrict out = (ushort* restrict)_out;
14934 (out+j+0)[0] = restart_index;
14935 (out+j+0)[1] = restart_index;
14940 (out+j)[0] = (ushort)in[end];
14941 (out+j)[1] = (ushort)in[start];
14949 (out+j)[0] = (ushort)in[end];
14950 (out+j)[1] = (ushort)in[start];
14956 (out+j)[0] = (ushort)in[i];
14957 (out+j)[1] = (ushort)in[i+1];
14960 (out+j)[0] = (ushort)in[end];
14961 (out+j)[1] = (ushort)in[start];
14972 ushort * restrict out = (ushort* restrict)_out;
14976 (out+j)[0] = (ushort)in[i];
14977 (out+j)[1] = (ushort)in[i+1];
14978 (out+j)[2] = (ushort)in[i+2];
14990 ushort * restrict out = (ushort* restrict)_out;
14994 (out+j)[0] = (ushort)in[i+(i&1)];
14995 (out+j)[1] = (ushort)in[i+1-(i&1)];
14996 (out+j)[2] = (ushort)in[i+2];
15008 ushort * restrict out = (ushort* restrict)_out;
15014 (out+j+0)[0] = restart_index;
15015 (out+j+0)[1] = restart_index;
15016 (out+j+0)[2] = restart_index;
15034 (out+j)[0] = (ushort)in[start];
15035 (out+j)[1] = (ushort)in[i+1];
15036 (out+j)[2] = (ushort)in[i+2];
15048 ushort * restrict out = (ushort* restrict)_out;
15054 (out+j+0)[0] = restart_index;
15055 (out+j+0)[1] = restart_index;
15056 (out+j+0)[2] = restart_index;
15057 (out+j+3)[0] = restart_index;
15058 (out+j+3)[1] = restart_index;
15059 (out+j+3)[2] = restart_index;
15078 (out+j+0)[0] = (ushort)in[i+0];
15079 (out+j+0)[1] = (ushort)in[i+1];
15080 (out+j+0)[2] = (ushort)in[i+3];
15081 (out+j+3)[0] = (ushort)in[i+1];
15082 (out+j+3)[1] = (ushort)in[i+2];
15083 (out+j+3)[2] = (ushort)in[i+3];
15095 ushort * restrict out = (ushort* restrict)_out;
15101 (out+j+0)[0] = restart_index;
15102 (out+j+0)[1] = restart_index;
15103 (out+j+0)[2] = restart_index;
15104 (out+j+3)[0] = restart_index;
15105 (out+j+3)[1] = restart_index;
15106 (out+j+3)[2] = restart_index;
15125 (out+j+0)[0] = (ushort)in[i+2];
15126 (out+j+0)[1] = (ushort)in[i+0];
15127 (out+j+0)[2] = (ushort)in[i+3];
15128 (out+j+3)[0] = (ushort)in[i+0];
15129 (out+j+3)[1] = (ushort)in[i+1];
15130 (out+j+3)[2] = (ushort)in[i+3];
15142 ushort * restrict out = (ushort* restrict)_out;
15148 (out+j+0)[0] = restart_index;
15149 (out+j+0)[1] = restart_index;
15150 (out+j+0)[2] = restart_index;
15168 (out+j)[0] = (ushort)in[i+1];
15169 (out+j)[1] = (ushort)in[i+2];
15170 (out+j)[2] = (ushort)in[start];
15182 ushort * restrict out = (ushort* restrict)_out;
15186 (out+j)[0] = (ushort)in[i+0];
15187 (out+j)[1] = (ushort)in[i+1];
15188 (out+j)[2] = (ushort)in[i+2];
15189 (out+j)[3] = (ushort)in[i+3];
15201 ushort * restrict out = (ushort* restrict)_out;
15205 (out+j)[0] = (ushort)in[i+0];
15206 (out+j)[1] = (ushort)in[i+1];
15207 (out+j)[2] = (ushort)in[i+2];
15208 (out+j)[3] = (ushort)in[i+3];
15220 ushort * restrict out = (ushort* restrict)_out;
15224 (out+j)[0] = (ushort)in[i+0];
15225 (out+j)[1] = (ushort)in[i+1];
15226 (out+j)[2] = (ushort)in[i+2];
15227 (out+j)[3] = (ushort)in[i+3];
15228 (out+j)[4] = (ushort)in[i+4];
15229 (out+j)[5] = (ushort)in[i+5];
15241 ushort * restrict out = (ushort* restrict)_out;
15247 (out+j)[0] = (ushort)in[i+0];
15248 (out+j)[1] = (ushort)in[i+1];
15249 (out+j)[2] = (ushort)in[i+2];
15250 (out+j)[3] = (ushort)in[i+3];
15251 (out+j)[4] = (ushort)in[i+4];
15252 (out+j)[5] = (ushort)in[i+5];
15255 (out+j)[0] = (ushort)in[i+2];
15256 (out+j)[1] = (ushort)in[i-2];
15257 (out+j)[2] = (ushort)in[i+0];
15258 (out+j)[3] = (ushort)in[i+3];
15259 (out+j)[4] = (ushort)in[i+4];
15260 (out+j)[5] = (ushort)in[i+6];
15273 uint * restrict out = (uint* restrict)_out;
15277 (out+j)[0] = (uint)in[i];
15289 uint * restrict out = (uint* restrict)_out;
15293 (out+j)[0] = (uint)in[i];
15294 (out+j)[1] = (uint)in[i+1];
15306 uint * restrict out = (uint* restrict)_out;
15310 (out+j)[0] = (uint)in[i];
15311 (out+j)[1] = (uint)in[i+1];
15323 uint * restrict out = (uint* restrict)_out;
15328 (out+j)[0] = (uint)in[i];
15329 (out+j)[1] = (uint)in[i+1];
15332 (out+j)[0] = (uint)in[end];
15333 (out+j)[1] = (uint)in[start];
15344 uint * restrict out = (uint* restrict)_out;
15348 (out+j)[0] = (uint)in[i];
15349 (out+j)[1] = (uint)in[i+1];
15350 (out+j)[2] = (uint)in[i+2];
15362 uint * restrict out = (uint* restrict)_out;
15366 (out+j)[0] = (uint)in[i];
15367 (out+j)[1] = (uint)in[i+1+(i&1)];
15368 (out+j)[2] = (uint)in[i+2-(i&1)];
15380 uint * restrict out = (uint* restrict)_out;
15384 (out+j)[0] = (uint)in[i+1];
15385 (out+j)[1] = (uint)in[i+2];
15386 (out+j)[2] = (uint)in[start];
15398 uint * restrict out = (uint* restrict)_out;
15402 (out+j+0)[0] = (uint)in[i+0];
15403 (out+j+0)[1] = (uint)in[i+1];
15404 (out+j+0)[2] = (uint)in[i+2];
15405 (out+j+3)[0] = (uint)in[i+0];
15406 (out+j+3)[1] = (uint)in[i+2];
15407 (out+j+3)[2] = (uint)in[i+3];
15419 uint * restrict out = (uint* restrict)_out;
15423 (out+j+0)[0] = (uint)in[i+0];
15424 (out+j+0)[1] = (uint)in[i+1];
15425 (out+j+0)[2] = (uint)in[i+3];
15426 (out+j+3)[0] = (uint)in[i+0];
15427 (out+j+3)[1] = (uint)in[i+3];
15428 (out+j+3)[2] = (uint)in[i+2];
15440 uint * restrict out = (uint* restrict)_out;
15444 (out+j)[0] = (uint)in[start];
15445 (out+j)[1] = (uint)in[i+1];
15446 (out+j)[2] = (uint)in[i+2];
15458 uint * restrict out = (uint* restrict)_out;
15462 (out+j)[0] = (uint)in[i+0];
15463 (out+j)[1] = (uint)in[i+1];
15464 (out+j)[2] = (uint)in[i+2];
15465 (out+j)[3] = (uint)in[i+3];
15477 uint * restrict out = (uint* restrict)_out;
15481 (out+j)[0] = (uint)in[i+0];
15482 (out+j)[1] = (uint)in[i+1];
15483 (out+j)[2] = (uint)in[i+2];
15484 (out+j)[3] = (uint)in[i+3];
15496 uint * restrict out = (uint* restrict)_out;
15500 (out+j)[0] = (uint)in[i+0];
15501 (out+j)[1] = (uint)in[i+1];
15502 (out+j)[2] = (uint)in[i+2];
15503 (out+j)[3] = (uint)in[i+3];
15504 (out+j)[4] = (uint)in[i+4];
15505 (out+j)[5] = (uint)in[i+5];
15517 uint * restrict out = (uint* restrict)_out;
15523 (out+j)[0] = (uint)in[i+0];
15524 (out+j)[1] = (uint)in[i+1];
15525 (out+j)[2] = (uint)in[i+2];
15526 (out+j)[3] = (uint)in[i+3];
15527 (out+j)[4] = (uint)in[i+4];
15528 (out+j)[5] = (uint)in[i+5];
15531 (out+j)[0] = (uint)in[i+2];
15532 (out+j)[1] = (uint)in[i-2];
15533 (out+j)[2] = (uint)in[i+0];
15534 (out+j)[3] = (uint)in[i+3];
15535 (out+j)[4] = (uint)in[i+4];
15536 (out+j)[5] = (uint)in[i+6];
15549 uint * restrict out = (uint* restrict)_out;
15553 (out+j)[0] = (uint)in[i];
15565 uint * restrict out = (uint* restrict)_out;
15569 (out+j)[0] = (uint)in[i];
15570 (out+j)[1] = (uint)in[i+1];
15582 uint * restrict out = (uint* restrict)_out;
15586 (out+j)[0] = (uint)in[i];
15587 (out+j)[1] = (uint)in[i+1];
15599 uint * restrict out = (uint* restrict)_out;
15606 (out+j+0)[0] = restart_index;
15607 (out+j+0)[1] = restart_index;
15612 (out+j)[0] = (uint)in[end];
15613 (out+j)[1] = (uint)in[start];
15621 (out+j)[0] = (uint)in[end];
15622 (out+j)[1] = (uint)in[start];
15628 (out+j)[0] = (uint)in[i];
15629 (out+j)[1] = (uint)in[i+1];
15632 (out+j)[0] = (uint)in[end];
15633 (out+j)[1] = (uint)in[start];
15644 uint * restrict out = (uint* restrict)_out;
15648 (out+j)[0] = (uint)in[i];
15649 (out+j)[1] = (uint)in[i+1];
15650 (out+j)[2] = (uint)in[i+2];
15662 uint * restrict out = (uint* restrict)_out;
15666 (out+j)[0] = (uint)in[i];
15667 (out+j)[1] = (uint)in[i+1+(i&1)];
15668 (out+j)[2] = (uint)in[i+2-(i&1)];
15680 uint * restrict out = (uint* restrict)_out;
15686 (out+j+0)[0] = restart_index;
15687 (out+j+0)[1] = restart_index;
15688 (out+j+0)[2] = restart_index;
15706 (out+j)[0] = (uint)in[i+1];
15707 (out+j)[1] = (uint)in[i+2];
15708 (out+j)[2] = (uint)in[start];
15720 uint * restrict out = (uint* restrict)_out;
15726 (out+j+0)[0] = restart_index;
15727 (out+j+0)[1] = restart_index;
15728 (out+j+0)[2] = restart_index;
15729 (out+j+3)[0] = restart_index;
15730 (out+j+3)[1] = restart_index;
15731 (out+j+3)[2] = restart_index;
15750 (out+j+0)[0] = (uint)in[i+0];
15751 (out+j+0)[1] = (uint)in[i+1];
15752 (out+j+0)[2] = (uint)in[i+2];
15753 (out+j+3)[0] = (uint)in[i+0];
15754 (out+j+3)[1] = (uint)in[i+2];
15755 (out+j+3)[2] = (uint)in[i+3];
15767 uint * restrict out = (uint* restrict)_out;
15773 (out+j+0)[0] = restart_index;
15774 (out+j+0)[1] = restart_index;
15775 (out+j+0)[2] = restart_index;
15776 (out+j+3)[0] = restart_index;
15777 (out+j+3)[1] = restart_index;
15778 (out+j+3)[2] = restart_index;
15797 (out+j+0)[0] = (uint)in[i+0];
15798 (out+j+0)[1] = (uint)in[i+1];
15799 (out+j+0)[2] = (uint)in[i+3];
15800 (out+j+3)[0] = (uint)in[i+0];
15801 (out+j+3)[1] = (uint)in[i+3];
15802 (out+j+3)[2] = (uint)in[i+2];
15814 uint * restrict out = (uint* restrict)_out;
15820 (out+j+0)[0] = restart_index;
15821 (out+j+0)[1] = restart_index;
15822 (out+j+0)[2] = restart_index;
15840 (out+j)[0] = (uint)in[start];
15841 (out+j)[1] = (uint)in[i+1];
15842 (out+j)[2] = (uint)in[i+2];
15854 uint * restrict out = (uint* restrict)_out;
15858 (out+j)[0] = (uint)in[i+0];
15859 (out+j)[1] = (uint)in[i+1];
15860 (out+j)[2] = (uint)in[i+2];
15861 (out+j)[3] = (uint)in[i+3];
15873 uint * restrict out = (uint* restrict)_out;
15877 (out+j)[0] = (uint)in[i+0];
15878 (out+j)[1] = (uint)in[i+1];
15879 (out+j)[2] = (uint)in[i+2];
15880 (out+j)[3] = (uint)in[i+3];
15892 uint * restrict out = (uint* restrict)_out;
15896 (out+j)[0] = (uint)in[i+0];
15897 (out+j)[1] = (uint)in[i+1];
15898 (out+j)[2] = (uint)in[i+2];
15899 (out+j)[3] = (uint)in[i+3];
15900 (out+j)[4] = (uint)in[i+4];
15901 (out+j)[5] = (uint)in[i+5];
15913 uint * restrict out = (uint* restrict)_out;
15919 (out+j)[0] = (uint)in[i+0];
15920 (out+j)[1] = (uint)in[i+1];
15921 (out+j)[2] = (uint)in[i+2];
15922 (out+j)[3] = (uint)in[i+3];
15923 (out+j)[4] = (uint)in[i+4];
15924 (out+j)[5] = (uint)in[i+5];
15927 (out+j)[0] = (uint)in[i+2];
15928 (out+j)[1] = (uint)in[i-2];
15929 (out+j)[2] = (uint)in[i+0];
15930 (out+j)[3] = (uint)in[i+3];
15931 (out+j)[4] = (uint)in[i+4];
15932 (out+j)[5] = (uint)in[i+6];
15945 uint * restrict out = (uint* restrict)_out;
15949 (out+j)[0] = (uint)in[i];
15961 uint * restrict out = (uint* restrict)_out;
15965 (out+j)[0] = (uint)in[i+1];
15966 (out+j)[1] = (uint)in[i];
15978 uint * restrict out = (uint* restrict)_out;
15982 (out+j)[0] = (uint)in[i+1];
15983 (out+j)[1] = (uint)in[i];
15995 uint * restrict out = (uint* restrict)_out;
16000 (out+j)[0] = (uint)in[i+1];
16001 (out+j)[1] = (uint)in[i];
16004 (out+j)[0] = (uint)in[start];
16005 (out+j)[1] = (uint)in[end];
16016 uint * restrict out = (uint* restrict)_out;
16020 (out+j)[0] = (uint)in[i+1];
16021 (out+j)[1] = (uint)in[i+2];
16022 (out+j)[2] = (uint)in[i];
16034 uint * restrict out = (uint* restrict)_out;
16038 (out+j)[0] = (uint)in[i+1+(i&1)];
16039 (out+j)[1] = (uint)in[i+2-(i&1)];
16040 (out+j)[2] = (uint)in[i];
16052 uint * restrict out = (uint* restrict)_out;
16056 (out+j)[0] = (uint)in[i+2];
16057 (out+j)[1] = (uint)in[start];
16058 (out+j)[2] = (uint)in[i+1];
16070 uint * restrict out = (uint* restrict)_out;
16074 (out+j+0)[0] = (uint)in[i+1];
16075 (out+j+0)[1] = (uint)in[i+2];
16076 (out+j+0)[2] = (uint)in[i+0];
16077 (out+j+3)[0] = (uint)in[i+2];
16078 (out+j+3)[1] = (uint)in[i+3];
16079 (out+j+3)[2] = (uint)in[i+0];
16091 uint * restrict out = (uint* restrict)_out;
16095 (out+j+0)[0] = (uint)in[i+1];
16096 (out+j+0)[1] = (uint)in[i+3];
16097 (out+j+0)[2] = (uint)in[i+0];
16098 (out+j+3)[0] = (uint)in[i+3];
16099 (out+j+3)[1] = (uint)in[i+2];
16100 (out+j+3)[2] = (uint)in[i+0];
16112 uint * restrict out = (uint* restrict)_out;
16116 (out+j)[0] = (uint)in[i+1];
16117 (out+j)[1] = (uint)in[i+2];
16118 (out+j)[2] = (uint)in[start];
16130 uint * restrict out = (uint* restrict)_out;
16134 (out+j)[0] = (uint)in[i+3];
16135 (out+j)[1] = (uint)in[i+2];
16136 (out+j)[2] = (uint)in[i+1];
16137 (out+j)[3] = (uint)in[i+0];
16149 uint * restrict out = (uint* restrict)_out;
16153 (out+j)[0] = (uint)in[i+3];
16154 (out+j)[1] = (uint)in[i+2];
16155 (out+j)[2] = (uint)in[i+1];
16156 (out+j)[3] = (uint)in[i+0];
16168 uint * restrict out = (uint* restrict)_out;
16172 (out+j)[0] = (uint)in[i+4];
16173 (out+j)[1] = (uint)in[i+5];
16174 (out+j)[2] = (uint)in[i+0];
16175 (out+j)[3] = (uint)in[i+1];
16176 (out+j)[4] = (uint)in[i+2];
16177 (out+j)[5] = (uint)in[i+3];
16189 uint * restrict out = (uint* restrict)_out;
16195 (out+j)[0] = (uint)in[i+4];
16196 (out+j)[1] = (uint)in[i+5];
16197 (out+j)[2] = (uint)in[i+0];
16198 (out+j)[3] = (uint)in[i+1];
16199 (out+j)[4] = (uint)in[i+2];
16200 (out+j)[5] = (uint)in[i+3];
16203 (out+j)[0] = (uint)in[i+4];
16204 (out+j)[1] = (uint)in[i+6];
16205 (out+j)[2] = (uint)in[i+2];
16206 (out+j)[3] = (uint)in[i-2];
16207 (out+j)[4] = (uint)in[i+0];
16208 (out+j)[5] = (uint)in[i+3];
16221 uint * restrict out = (uint* restrict)_out;
16225 (out+j)[0] = (uint)in[i];
16237 uint * restrict out = (uint* restrict)_out;
16241 (out+j)[0] = (uint)in[i+1];
16242 (out+j)[1] = (uint)in[i];
16254 uint * restrict out = (uint* restrict)_out;
16258 (out+j)[0] = (uint)in[i+1];
16259 (out+j)[1] = (uint)in[i];
16271 uint * restrict out = (uint* restrict)_out;
16278 (out+j+0)[0] = restart_index;
16279 (out+j+0)[1] = restart_index;
16284 (out+j)[0] = (uint)in[start];
16285 (out+j)[1] = (uint)in[end];
16293 (out+j)[0] = (uint)in[start];
16294 (out+j)[1] = (uint)in[end];
16300 (out+j)[0] = (uint)in[i+1];
16301 (out+j)[1] = (uint)in[i];
16304 (out+j)[0] = (uint)in[start];
16305 (out+j)[1] = (uint)in[end];
16316 uint * restrict out = (uint* restrict)_out;
16320 (out+j)[0] = (uint)in[i+1];
16321 (out+j)[1] = (uint)in[i+2];
16322 (out+j)[2] = (uint)in[i];
16334 uint * restrict out = (uint* restrict)_out;
16338 (out+j)[0] = (uint)in[i+1+(i&1)];
16339 (out+j)[1] = (uint)in[i+2-(i&1)];
16340 (out+j)[2] = (uint)in[i];
16352 uint * restrict out = (uint* restrict)_out;
16358 (out+j+0)[0] = restart_index;
16359 (out+j+0)[1] = restart_index;
16360 (out+j+0)[2] = restart_index;
16378 (out+j)[0] = (uint)in[i+2];
16379 (out+j)[1] = (uint)in[start];
16380 (out+j)[2] = (uint)in[i+1];
16392 uint * restrict out = (uint* restrict)_out;
16398 (out+j+0)[0] = restart_index;
16399 (out+j+0)[1] = restart_index;
16400 (out+j+0)[2] = restart_index;
16401 (out+j+3)[0] = restart_index;
16402 (out+j+3)[1] = restart_index;
16403 (out+j+3)[2] = restart_index;
16422 (out+j+0)[0] = (uint)in[i+1];
16423 (out+j+0)[1] = (uint)in[i+2];
16424 (out+j+0)[2] = (uint)in[i+0];
16425 (out+j+3)[0] = (uint)in[i+2];
16426 (out+j+3)[1] = (uint)in[i+3];
16427 (out+j+3)[2] = (uint)in[i+0];
16439 uint * restrict out = (uint* restrict)_out;
16445 (out+j+0)[0] = restart_index;
16446 (out+j+0)[1] = restart_index;
16447 (out+j+0)[2] = restart_index;
16448 (out+j+3)[0] = restart_index;
16449 (out+j+3)[1] = restart_index;
16450 (out+j+3)[2] = restart_index;
16469 (out+j+0)[0] = (uint)in[i+1];
16470 (out+j+0)[1] = (uint)in[i+3];
16471 (out+j+0)[2] = (uint)in[i+0];
16472 (out+j+3)[0] = (uint)in[i+3];
16473 (out+j+3)[1] = (uint)in[i+2];
16474 (out+j+3)[2] = (uint)in[i+0];
16486 uint * restrict out = (uint* restrict)_out;
16492 (out+j+0)[0] = restart_index;
16493 (out+j+0)[1] = restart_index;
16494 (out+j+0)[2] = restart_index;
16512 (out+j)[0] = (uint)in[i+1];
16513 (out+j)[1] = (uint)in[i+2];
16514 (out+j)[2] = (uint)in[start];
16526 uint * restrict out = (uint* restrict)_out;
16530 (out+j)[0] = (uint)in[i+3];
16531 (out+j)[1] = (uint)in[i+2];
16532 (out+j)[2] = (uint)in[i+1];
16533 (out+j)[3] = (uint)in[i+0];
16545 uint * restrict out = (uint* restrict)_out;
16549 (out+j)[0] = (uint)in[i+3];
16550 (out+j)[1] = (uint)in[i+2];
16551 (out+j)[2] = (uint)in[i+1];
16552 (out+j)[3] = (uint)in[i+0];
16564 uint * restrict out = (uint* restrict)_out;
16568 (out+j)[0] = (uint)in[i+4];
16569 (out+j)[1] = (uint)in[i+5];
16570 (out+j)[2] = (uint)in[i+0];
16571 (out+j)[3] = (uint)in[i+1];
16572 (out+j)[4] = (uint)in[i+2];
16573 (out+j)[5] = (uint)in[i+3];
16585 uint * restrict out = (uint* restrict)_out;
16591 (out+j)[0] = (uint)in[i+4];
16592 (out+j)[1] = (uint)in[i+5];
16593 (out+j)[2] = (uint)in[i+0];
16594 (out+j)[3] = (uint)in[i+1];
16595 (out+j)[4] = (uint)in[i+2];
16596 (out+j)[5] = (uint)in[i+3];
16599 (out+j)[0] = (uint)in[i+4];
16600 (out+j)[1] = (uint)in[i+6];
16601 (out+j)[2] = (uint)in[i+2];
16602 (out+j)[3] = (uint)in[i-2];
16603 (out+j)[4] = (uint)in[i+0];
16604 (out+j)[5] = (uint)in[i+3];
16617 uint * restrict out = (uint* restrict)_out;
16621 (out+j)[0] = (uint)in[i];
16633 uint * restrict out = (uint* restrict)_out;
16637 (out+j)[0] = (uint)in[i+1];
16638 (out+j)[1] = (uint)in[i];
16650 uint * restrict out = (uint* restrict)_out;
16654 (out+j)[0] = (uint)in[i+1];
16655 (out+j)[1] = (uint)in[i];
16667 uint * restrict out = (uint* restrict)_out;
16672 (out+j)[0] = (uint)in[i+1];
16673 (out+j)[1] = (uint)in[i];
16676 (out+j)[0] = (uint)in[start];
16677 (out+j)[1] = (uint)in[end];
16688 uint * restrict out = (uint* restrict)_out;
16692 (out+j)[0] = (uint)in[i+2];
16693 (out+j)[1] = (uint)in[i];
16694 (out+j)[2] = (uint)in[i+1];
16706 uint * restrict out = (uint* restrict)_out;
16710 (out+j)[0] = (uint)in[i+2];
16711 (out+j)[1] = (uint)in[i+(i&1)];
16712 (out+j)[2] = (uint)in[i+1-(i&1)];
16724 uint * restrict out = (uint* restrict)_out;
16728 (out+j)[0] = (uint)in[i+2];
16729 (out+j)[1] = (uint)in[start];
16730 (out+j)[2] = (uint)in[i+1];
16742 uint * restrict out = (uint* restrict)_out;
16746 (out+j+0)[0] = (uint)in[i+3];
16747 (out+j+0)[1] = (uint)in[i+0];
16748 (out+j+0)[2] = (uint)in[i+1];
16749 (out+j+3)[0] = (uint)in[i+3];
16750 (out+j+3)[1] = (uint)in[i+1];
16751 (out+j+3)[2] = (uint)in[i+2];
16763 uint * restrict out = (uint* restrict)_out;
16767 (out+j+0)[0] = (uint)in[i+3];
16768 (out+j+0)[1] = (uint)in[i+2];
16769 (out+j+0)[2] = (uint)in[i+0];
16770 (out+j+3)[0] = (uint)in[i+3];
16771 (out+j+3)[1] = (uint)in[i+0];
16772 (out+j+3)[2] = (uint)in[i+1];
16784 uint * restrict out = (uint* restrict)_out;
16788 (out+j)[0] = (uint)in[start];
16789 (out+j)[1] = (uint)in[i+1];
16790 (out+j)[2] = (uint)in[i+2];
16802 uint * restrict out = (uint* restrict)_out;
16806 (out+j)[0] = (uint)in[i+3];
16807 (out+j)[1] = (uint)in[i+2];
16808 (out+j)[2] = (uint)in[i+1];
16809 (out+j)[3] = (uint)in[i+0];
16821 uint * restrict out = (uint* restrict)_out;
16825 (out+j)[0] = (uint)in[i+3];
16826 (out+j)[1] = (uint)in[i+2];
16827 (out+j)[2] = (uint)in[i+1];
16828 (out+j)[3] = (uint)in[i+0];
16840 uint * restrict out = (uint* restrict)_out;
16844 (out+j)[0] = (uint)in[i+4];
16845 (out+j)[1] = (uint)in[i+5];
16846 (out+j)[2] = (uint)in[i+0];
16847 (out+j)[3] = (uint)in[i+1];
16848 (out+j)[4] = (uint)in[i+2];
16849 (out+j)[5] = (uint)in[i+3];
16861 uint * restrict out = (uint* restrict)_out;
16867 (out+j)[0] = (uint)in[i+4];
16868 (out+j)[1] = (uint)in[i+5];
16869 (out+j)[2] = (uint)in[i+0];
16870 (out+j)[3] = (uint)in[i+1];
16871 (out+j)[4] = (uint)in[i+2];
16872 (out+j)[5] = (uint)in[i+3];
16875 (out+j)[0] = (uint)in[i+4];
16876 (out+j)[1] = (uint)in[i+6];
16877 (out+j)[2] = (uint)in[i+2];
16878 (out+j)[3] = (uint)in[i-2];
16879 (out+j)[4] = (uint)in[i+0];
16880 (out+j)[5] = (uint)in[i+3];
16893 uint * restrict out = (uint* restrict)_out;
16897 (out+j)[0] = (uint)in[i];
16909 uint * restrict out = (uint* restrict)_out;
16913 (out+j)[0] = (uint)in[i+1];
16914 (out+j)[1] = (uint)in[i];
16926 uint * restrict out = (uint* restrict)_out;
16930 (out+j)[0] = (uint)in[i+1];
16931 (out+j)[1] = (uint)in[i];
16943 uint * restrict out = (uint* restrict)_out;
16950 (out+j+0)[0] = restart_index;
16951 (out+j+0)[1] = restart_index;
16956 (out+j)[0] = (uint)in[start];
16957 (out+j)[1] = (uint)in[end];
16965 (out+j)[0] = (uint)in[start];
16966 (out+j)[1] = (uint)in[end];
16972 (out+j)[0] = (uint)in[i+1];
16973 (out+j)[1] = (uint)in[i];
16976 (out+j)[0] = (uint)in[start];
16977 (out+j)[1] = (uint)in[end];
16988 uint * restrict out = (uint* restrict)_out;
16992 (out+j)[0] = (uint)in[i+2];
16993 (out+j)[1] = (uint)in[i];
16994 (out+j)[2] = (uint)in[i+1];
17006 uint * restrict out = (uint* restrict)_out;
17010 (out+j)[0] = (uint)in[i+2];
17011 (out+j)[1] = (uint)in[i+(i&1)];
17012 (out+j)[2] = (uint)in[i+1-(i&1)];
17024 uint * restrict out = (uint* restrict)_out;
17030 (out+j+0)[0] = restart_index;
17031 (out+j+0)[1] = restart_index;
17032 (out+j+0)[2] = restart_index;
17050 (out+j)[0] = (uint)in[i+2];
17051 (out+j)[1] = (uint)in[start];
17052 (out+j)[2] = (uint)in[i+1];
17064 uint * restrict out = (uint* restrict)_out;
17070 (out+j+0)[0] = restart_index;
17071 (out+j+0)[1] = restart_index;
17072 (out+j+0)[2] = restart_index;
17073 (out+j+3)[0] = restart_index;
17074 (out+j+3)[1] = restart_index;
17075 (out+j+3)[2] = restart_index;
17094 (out+j+0)[0] = (uint)in[i+3];
17095 (out+j+0)[1] = (uint)in[i+0];
17096 (out+j+0)[2] = (uint)in[i+1];
17097 (out+j+3)[0] = (uint)in[i+3];
17098 (out+j+3)[1] = (uint)in[i+1];
17099 (out+j+3)[2] = (uint)in[i+2];
17111 uint * restrict out = (uint* restrict)_out;
17117 (out+j+0)[0] = restart_index;
17118 (out+j+0)[1] = restart_index;
17119 (out+j+0)[2] = restart_index;
17120 (out+j+3)[0] = restart_index;
17121 (out+j+3)[1] = restart_index;
17122 (out+j+3)[2] = restart_index;
17141 (out+j+0)[0] = (uint)in[i+3];
17142 (out+j+0)[1] = (uint)in[i+2];
17143 (out+j+0)[2] = (uint)in[i+0];
17144 (out+j+3)[0] = (uint)in[i+3];
17145 (out+j+3)[1] = (uint)in[i+0];
17146 (out+j+3)[2] = (uint)in[i+1];
17158 uint * restrict out = (uint* restrict)_out;
17164 (out+j+0)[0] = restart_index;
17165 (out+j+0)[1] = restart_index;
17166 (out+j+0)[2] = restart_index;
17184 (out+j)[0] = (uint)in[start];
17185 (out+j)[1] = (uint)in[i+1];
17186 (out+j)[2] = (uint)in[i+2];
17198 uint * restrict out = (uint* restrict)_out;
17202 (out+j)[0] = (uint)in[i+3];
17203 (out+j)[1] = (uint)in[i+2];
17204 (out+j)[2] = (uint)in[i+1];
17205 (out+j)[3] = (uint)in[i+0];
17217 uint * restrict out = (uint* restrict)_out;
17221 (out+j)[0] = (uint)in[i+3];
17222 (out+j)[1] = (uint)in[i+2];
17223 (out+j)[2] = (uint)in[i+1];
17224 (out+j)[3] = (uint)in[i+0];
17236 uint * restrict out = (uint* restrict)_out;
17240 (out+j)[0] = (uint)in[i+4];
17241 (out+j)[1] = (uint)in[i+5];
17242 (out+j)[2] = (uint)in[i+0];
17243 (out+j)[3] = (uint)in[i+1];
17244 (out+j)[4] = (uint)in[i+2];
17245 (out+j)[5] = (uint)in[i+3];
17257 uint * restrict out = (uint* restrict)_out;
17263 (out+j)[0] = (uint)in[i+4];
17264 (out+j)[1] = (uint)in[i+5];
17265 (out+j)[2] = (uint)in[i+0];
17266 (out+j)[3] = (uint)in[i+1];
17267 (out+j)[4] = (uint)in[i+2];
17268 (out+j)[5] = (uint)in[i+3];
17271 (out+j)[0] = (uint)in[i+4];
17272 (out+j)[1] = (uint)in[i+6];
17273 (out+j)[2] = (uint)in[i+2];
17274 (out+j)[3] = (uint)in[i-2];
17275 (out+j)[4] = (uint)in[i+0];
17276 (out+j)[5] = (uint)in[i+3];
17289 uint * restrict out = (uint* restrict)_out;
17293 (out+j)[0] = (uint)in[i];
17305 uint * restrict out = (uint* restrict)_out;
17309 (out+j)[0] = (uint)in[i];
17310 (out+j)[1] = (uint)in[i+1];
17322 uint * restrict out = (uint* restrict)_out;
17326 (out+j)[0] = (uint)in[i];
17327 (out+j)[1] = (uint)in[i+1];
17339 uint * restrict out = (uint* restrict)_out;
17344 (out+j)[0] = (uint)in[i];
17345 (out+j)[1] = (uint)in[i+1];
17348 (out+j)[0] = (uint)in[end];
17349 (out+j)[1] = (uint)in[start];
17360 uint * restrict out = (uint* restrict)_out;
17364 (out+j)[0] = (uint)in[i];
17365 (out+j)[1] = (uint)in[i+1];
17366 (out+j)[2] = (uint)in[i+2];
17378 uint * restrict out = (uint* restrict)_out;
17382 (out+j)[0] = (uint)in[i+(i&1)];
17383 (out+j)[1] = (uint)in[i+1-(i&1)];
17384 (out+j)[2] = (uint)in[i+2];
17396 uint * restrict out = (uint* restrict)_out;
17400 (out+j)[0] = (uint)in[start];
17401 (out+j)[1] = (uint)in[i+1];
17402 (out+j)[2] = (uint)in[i+2];
17414 uint * restrict out = (uint* restrict)_out;
17418 (out+j+0)[0] = (uint)in[i+0];
17419 (out+j+0)[1] = (uint)in[i+1];
17420 (out+j+0)[2] = (uint)in[i+3];
17421 (out+j+3)[0] = (uint)in[i+1];
17422 (out+j+3)[1] = (uint)in[i+2];
17423 (out+j+3)[2] = (uint)in[i+3];
17435 uint * restrict out = (uint* restrict)_out;
17439 (out+j+0)[0] = (uint)in[i+2];
17440 (out+j+0)[1] = (uint)in[i+0];
17441 (out+j+0)[2] = (uint)in[i+3];
17442 (out+j+3)[0] = (uint)in[i+0];
17443 (out+j+3)[1] = (uint)in[i+1];
17444 (out+j+3)[2] = (uint)in[i+3];
17456 uint * restrict out = (uint* restrict)_out;
17460 (out+j)[0] = (uint)in[i+1];
17461 (out+j)[1] = (uint)in[i+2];
17462 (out+j)[2] = (uint)in[start];
17474 uint * restrict out = (uint* restrict)_out;
17478 (out+j)[0] = (uint)in[i+0];
17479 (out+j)[1] = (uint)in[i+1];
17480 (out+j)[2] = (uint)in[i+2];
17481 (out+j)[3] = (uint)in[i+3];
17493 uint * restrict out = (uint* restrict)_out;
17497 (out+j)[0] = (uint)in[i+0];
17498 (out+j)[1] = (uint)in[i+1];
17499 (out+j)[2] = (uint)in[i+2];
17500 (out+j)[3] = (uint)in[i+3];
17512 uint * restrict out = (uint* restrict)_out;
17516 (out+j)[0] = (uint)in[i+0];
17517 (out+j)[1] = (uint)in[i+1];
17518 (out+j)[2] = (uint)in[i+2];
17519 (out+j)[3] = (uint)in[i+3];
17520 (out+j)[4] = (uint)in[i+4];
17521 (out+j)[5] = (uint)in[i+5];
17533 uint * restrict out = (uint* restrict)_out;
17539 (out+j)[0] = (uint)in[i+0];
17540 (out+j)[1] = (uint)in[i+1];
17541 (out+j)[2] = (uint)in[i+2];
17542 (out+j)[3] = (uint)in[i+3];
17543 (out+j)[4] = (uint)in[i+4];
17544 (out+j)[5] = (uint)in[i+5];
17547 (out+j)[0] = (uint)in[i+2];
17548 (out+j)[1] = (uint)in[i-2];
17549 (out+j)[2] = (uint)in[i+0];
17550 (out+j)[3] = (uint)in[i+3];
17551 (out+j)[4] = (uint)in[i+4];
17552 (out+j)[5] = (uint)in[i+6];
17565 uint * restrict out = (uint* restrict)_out;
17569 (out+j)[0] = (uint)in[i];
17581 uint * restrict out = (uint* restrict)_out;
17585 (out+j)[0] = (uint)in[i];
17586 (out+j)[1] = (uint)in[i+1];
17598 uint * restrict out = (uint* restrict)_out;
17602 (out+j)[0] = (uint)in[i];
17603 (out+j)[1] = (uint)in[i+1];
17615 uint * restrict out = (uint* restrict)_out;
17622 (out+j+0)[0] = restart_index;
17623 (out+j+0)[1] = restart_index;
17628 (out+j)[0] = (uint)in[end];
17629 (out+j)[1] = (uint)in[start];
17637 (out+j)[0] = (uint)in[end];
17638 (out+j)[1] = (uint)in[start];
17644 (out+j)[0] = (uint)in[i];
17645 (out+j)[1] = (uint)in[i+1];
17648 (out+j)[0] = (uint)in[end];
17649 (out+j)[1] = (uint)in[start];
17660 uint * restrict out = (uint* restrict)_out;
17664 (out+j)[0] = (uint)in[i];
17665 (out+j)[1] = (uint)in[i+1];
17666 (out+j)[2] = (uint)in[i+2];
17678 uint * restrict out = (uint* restrict)_out;
17682 (out+j)[0] = (uint)in[i+(i&1)];
17683 (out+j)[1] = (uint)in[i+1-(i&1)];
17684 (out+j)[2] = (uint)in[i+2];
17696 uint * restrict out = (uint* restrict)_out;
17702 (out+j+0)[0] = restart_index;
17703 (out+j+0)[1] = restart_index;
17704 (out+j+0)[2] = restart_index;
17722 (out+j)[0] = (uint)in[start];
17723 (out+j)[1] = (uint)in[i+1];
17724 (out+j)[2] = (uint)in[i+2];
17736 uint * restrict out = (uint* restrict)_out;
17742 (out+j+0)[0] = restart_index;
17743 (out+j+0)[1] = restart_index;
17744 (out+j+0)[2] = restart_index;
17745 (out+j+3)[0] = restart_index;
17746 (out+j+3)[1] = restart_index;
17747 (out+j+3)[2] = restart_index;
17766 (out+j+0)[0] = (uint)in[i+0];
17767 (out+j+0)[1] = (uint)in[i+1];
17768 (out+j+0)[2] = (uint)in[i+3];
17769 (out+j+3)[0] = (uint)in[i+1];
17770 (out+j+3)[1] = (uint)in[i+2];
17771 (out+j+3)[2] = (uint)in[i+3];
17783 uint * restrict out = (uint* restrict)_out;
17789 (out+j+0)[0] = restart_index;
17790 (out+j+0)[1] = restart_index;
17791 (out+j+0)[2] = restart_index;
17792 (out+j+3)[0] = restart_index;
17793 (out+j+3)[1] = restart_index;
17794 (out+j+3)[2] = restart_index;
17813 (out+j+0)[0] = (uint)in[i+2];
17814 (out+j+0)[1] = (uint)in[i+0];
17815 (out+j+0)[2] = (uint)in[i+3];
17816 (out+j+3)[0] = (uint)in[i+0];
17817 (out+j+3)[1] = (uint)in[i+1];
17818 (out+j+3)[2] = (uint)in[i+3];
17830 uint * restrict out = (uint* restrict)_out;
17836 (out+j+0)[0] = restart_index;
17837 (out+j+0)[1] = restart_index;
17838 (out+j+0)[2] = restart_index;
17856 (out+j)[0] = (uint)in[i+1];
17857 (out+j)[1] = (uint)in[i+2];
17858 (out+j)[2] = (uint)in[start];
17870 uint * restrict out = (uint* restrict)_out;
17874 (out+j)[0] = (uint)in[i+0];
17875 (out+j)[1] = (uint)in[i+1];
17876 (out+j)[2] = (uint)in[i+2];
17877 (out+j)[3] = (uint)in[i+3];
17889 uint * restrict out = (uint* restrict)_out;
17893 (out+j)[0] = (uint)in[i+0];
17894 (out+j)[1] = (uint)in[i+1];
17895 (out+j)[2] = (uint)in[i+2];
17896 (out+j)[3] = (uint)in[i+3];
17908 uint * restrict out = (uint* restrict)_out;
17912 (out+j)[0] = (uint)in[i+0];
17913 (out+j)[1] = (uint)in[i+1];
17914 (out+j)[2] = (uint)in[i+2];
17915 (out+j)[3] = (uint)in[i+3];
17916 (out+j)[4] = (uint)in[i+4];
17917 (out+j)[5] = (uint)in[i+5];
17929 uint * restrict out = (uint* restrict)_out;
17935 (out+j)[0] = (uint)in[i+0];
17936 (out+j)[1] = (uint)in[i+1];
17937 (out+j)[2] = (uint)in[i+2];
17938 (out+j)[3] = (uint)in[i+3];
17939 (out+j)[4] = (uint)in[i+4];
17940 (out+j)[5] = (uint)in[i+5];
17943 (out+j)[0] = (uint)in[i+2];
17944 (out+j)[1] = (uint)in[i-2];
17945 (out+j)[2] = (uint)in[i+0];
17946 (out+j)[3] = (uint)in[i+3];
17947 (out+j)[4] = (uint)in[i+4];
17948 (out+j)[5] = (uint)in[i+6];