Lines Matching refs:out
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
69 ushort *out = (ushort*)_out;
73 (out+j)[0] = (ushort)(i);
81 ushort *out = (ushort*)_out;
85 (out+j)[0] = (ushort)(i);
86 (out+j)[1] = (ushort)(i+1);
94 ushort *out = (ushort*)_out;
98 (out+j)[0] = (ushort)(i);
99 (out+j)[1] = (ushort)(i+1);
107 ushort *out = (ushort*)_out;
111 (out+j)[0] = (ushort)(i);
112 (out+j)[1] = (ushort)(i+1);
114 (out+j)[0] = (ushort)(i);
115 (out+j)[1] = (ushort)(start);
122 ushort *out = (ushort*)_out;
126 (out+j)[0] = (ushort)(i);
127 (out+j)[1] = (ushort)(i+1);
128 (out+j)[2] = (ushort)(i+2);
136 ushort *out = (ushort*)_out;
140 (out+j)[0] = (ushort)(i);
141 (out+j)[1] = (ushort)(i+1+(i&1));
142 (out+j)[2] = (ushort)(i+2-(i&1));
150 ushort *out = (ushort*)_out;
154 (out+j)[0] = (ushort)(start);
155 (out+j)[1] = (ushort)(i+1);
156 (out+j)[2] = (ushort)(i+2);
164 ushort *out = (ushort*)_out;
168 (out+j+0)[0] = (ushort)(i+0);
169 (out+j+0)[1] = (ushort)(i+1);
170 (out+j+0)[2] = (ushort)(i+2);
171 (out+j+3)[0] = (ushort)(i+0);
172 (out+j+3)[1] = (ushort)(i+2);
173 (out+j+3)[2] = (ushort)(i+3);
181 ushort *out = (ushort*)_out;
185 (out+j+0)[0] = (ushort)(i+0);
186 (out+j+0)[1] = (ushort)(i+1);
187 (out+j+0)[2] = (ushort)(i+3);
188 (out+j+3)[0] = (ushort)(i+0);
189 (out+j+3)[1] = (ushort)(i+3);
190 (out+j+3)[2] = (ushort)(i+2);
198 ushort *out = (ushort*)_out;
202 (out+j)[0] = (ushort)(start);
203 (out+j)[1] = (ushort)(i+1);
204 (out+j)[2] = (ushort)(i+2);
212 ushort *out = (ushort*)_out;
216 (out+j)[0] = (ushort)(i+0);
217 (out+j)[1] = (ushort)(i+1);
218 (out+j)[2] = (ushort)(i+2);
219 (out+j)[3] = (ushort)(i+3);
227 ushort *out = (ushort*)_out;
231 (out+j)[0] = (ushort)(i+0);
232 (out+j)[1] = (ushort)(i+1);
233 (out+j)[2] = (ushort)(i+2);
234 (out+j)[3] = (ushort)(i+3);
242 ushort *out = (ushort*)_out;
246 (out+j)[0] = (ushort)(i+0);
247 (out+j)[1] = (ushort)(i+1);
248 (out+j)[2] = (ushort)(i+2);
249 (out+j)[3] = (ushort)(i+3);
250 (out+j)[4] = (ushort)(i+4);
251 (out+j)[5] = (ushort)(i+5);
259 ushort *out = (ushort*)_out;
265 (out+j)[0] = (ushort)(i+0);
266 (out+j)[1] = (ushort)(i+1);
267 (out+j)[2] = (ushort)(i+2);
268 (out+j)[3] = (ushort)(i+3);
269 (out+j)[4] = (ushort)(i+4);
270 (out+j)[5] = (ushort)(i+5);
273 (out+j)[0] = (ushort)(i+2);
274 (out+j)[1] = (ushort)(i-2);
275 (out+j)[2] = (ushort)(i+0);
276 (out+j)[3] = (ushort)(i+3);
277 (out+j)[4] = (ushort)(i+4);
278 (out+j)[5] = (ushort)(i+6);
287 ushort *out = (ushort*)_out;
291 (out+j)[0] = (ushort)(i);
299 ushort *out = (ushort*)_out;
303 (out+j)[0] = (ushort)(i+1);
304 (out+j)[1] = (ushort)(i);
312 ushort *out = (ushort*)_out;
316 (out+j)[0] = (ushort)(i+1);
317 (out+j)[1] = (ushort)(i);
325 ushort *out = (ushort*)_out;
329 (out+j)[0] = (ushort)(i+1);
330 (out+j)[1] = (ushort)(i);
332 (out+j)[0] = (ushort)(start);
333 (out+j)[1] = (ushort)(i);
340 ushort *out = (ushort*)_out;
344 (out+j)[0] = (ushort)(i+1);
345 (out+j)[1] = (ushort)(i+2);
346 (out+j)[2] = (ushort)(i);
354 ushort *out = (ushort*)_out;
358 (out+j)[0] = (ushort)(i+1+(i&1));
359 (out+j)[1] = (ushort)(i+2-(i&1));
360 (out+j)[2] = (ushort)(i);
368 ushort *out = (ushort*)_out;
372 (out+j)[0] = (ushort)(i+1);
373 (out+j)[1] = (ushort)(i+2);
374 (out+j)[2] = (ushort)(start);
382 ushort *out = (ushort*)_out;
386 (out+j+0)[0] = (ushort)(i+1);
387 (out+j+0)[1] = (ushort)(i+2);
388 (out+j+0)[2] = (ushort)(i+0);
389 (out+j+3)[0] = (ushort)(i+2);
390 (out+j+3)[1] = (ushort)(i+3);
391 (out+j+3)[2] = (ushort)(i+0);
399 ushort *out = (ushort*)_out;
403 (out+j+0)[0] = (ushort)(i+1);
404 (out+j+0)[1] = (ushort)(i+3);
405 (out+j+0)[2] = (ushort)(i+0);
406 (out+j+3)[0] = (ushort)(i+3);
407 (out+j+3)[1] = (ushort)(i+2);
408 (out+j+3)[2] = (ushort)(i+0);
416 ushort *out = (ushort*)_out;
420 (out+j)[0] = (ushort)(i+1);
421 (out+j)[1] = (ushort)(i+2);
422 (out+j)[2] = (ushort)(start);
430 ushort *out = (ushort*)_out;
434 (out+j)[0] = (ushort)(i+3);
435 (out+j)[1] = (ushort)(i+2);
436 (out+j)[2] = (ushort)(i+1);
437 (out+j)[3] = (ushort)(i+0);
445 ushort *out = (ushort*)_out;
449 (out+j)[0] = (ushort)(i+3);
450 (out+j)[1] = (ushort)(i+2);
451 (out+j)[2] = (ushort)(i+1);
452 (out+j)[3] = (ushort)(i+0);
460 ushort *out = (ushort*)_out;
464 (out+j)[0] = (ushort)(i+4);
465 (out+j)[1] = (ushort)(i+5);
466 (out+j)[2] = (ushort)(i+0);
467 (out+j)[3] = (ushort)(i+1);
468 (out+j)[4] = (ushort)(i+2);
469 (out+j)[5] = (ushort)(i+3);
477 ushort *out = (ushort*)_out;
483 (out+j)[0] = (ushort)(i+4);
484 (out+j)[1] = (ushort)(i+5);
485 (out+j)[2] = (ushort)(i+0);
486 (out+j)[3] = (ushort)(i+1);
487 (out+j)[4] = (ushort)(i+2);
488 (out+j)[5] = (ushort)(i+3);
491 (out+j)[0] = (ushort)(i+4);
492 (out+j)[1] = (ushort)(i+6);
493 (out+j)[2] = (ushort)(i+2);
494 (out+j)[3] = (ushort)(i-2);
495 (out+j)[4] = (ushort)(i+0);
496 (out+j)[5] = (ushort)(i+3);
505 ushort *out = (ushort*)_out;
509 (out+j)[0] = (ushort)(i);
517 ushort *out = (ushort*)_out;
521 (out+j)[0] = (ushort)(i+1);
522 (out+j)[1] = (ushort)(i);
530 ushort *out = (ushort*)_out;
534 (out+j)[0] = (ushort)(i+1);
535 (out+j)[1] = (ushort)(i);
543 ushort *out = (ushort*)_out;
547 (out+j)[0] = (ushort)(i+1);
548 (out+j)[1] = (ushort)(i);
550 (out+j)[0] = (ushort)(start);
551 (out+j)[1] = (ushort)(i);
558 ushort *out = (ushort*)_out;
562 (out+j)[0] = (ushort)(i+2);
563 (out+j)[1] = (ushort)(i);
564 (out+j)[2] = (ushort)(i+1);
572 ushort *out = (ushort*)_out;
576 (out+j)[0] = (ushort)(i+2);
577 (out+j)[1] = (ushort)(i+(i&1));
578 (out+j)[2] = (ushort)(i+1-(i&1));
586 ushort *out = (ushort*)_out;
590 (out+j)[0] = (ushort)(i+2);
591 (out+j)[1] = (ushort)(start);
592 (out+j)[2] = (ushort)(i+1);
600 ushort *out = (ushort*)_out;
604 (out+j+0)[0] = (ushort)(i+3);
605 (out+j+0)[1] = (ushort)(i+0);
606 (out+j+0)[2] = (ushort)(i+1);
607 (out+j+3)[0] = (ushort)(i+3);
608 (out+j+3)[1] = (ushort)(i+1);
609 (out+j+3)[2] = (ushort)(i+2);
617 ushort *out = (ushort*)_out;
621 (out+j+0)[0] = (ushort)(i+3);
622 (out+j+0)[1] = (ushort)(i+2);
623 (out+j+0)[2] = (ushort)(i+0);
624 (out+j+3)[0] = (ushort)(i+3);
625 (out+j+3)[1] = (ushort)(i+0);
626 (out+j+3)[2] = (ushort)(i+1);
634 ushort *out = (ushort*)_out;
638 (out+j)[0] = (ushort)(start);
639 (out+j)[1] = (ushort)(i+1);
640 (out+j)[2] = (ushort)(i+2);
648 ushort *out = (ushort*)_out;
652 (out+j)[0] = (ushort)(i+3);
653 (out+j)[1] = (ushort)(i+2);
654 (out+j)[2] = (ushort)(i+1);
655 (out+j)[3] = (ushort)(i+0);
663 ushort *out = (ushort*)_out;
667 (out+j)[0] = (ushort)(i+3);
668 (out+j)[1] = (ushort)(i+2);
669 (out+j)[2] = (ushort)(i+1);
670 (out+j)[3] = (ushort)(i+0);
678 ushort *out = (ushort*)_out;
682 (out+j)[0] = (ushort)(i+4);
683 (out+j)[1] = (ushort)(i+5);
684 (out+j)[2] = (ushort)(i+0);
685 (out+j)[3] = (ushort)(i+1);
686 (out+j)[4] = (ushort)(i+2);
687 (out+j)[5] = (ushort)(i+3);
695 ushort *out = (ushort*)_out;
701 (out+j)[0] = (ushort)(i+4);
702 (out+j)[1] = (ushort)(i+5);
703 (out+j)[2] = (ushort)(i+0);
704 (out+j)[3] = (ushort)(i+1);
705 (out+j)[4] = (ushort)(i+2);
706 (out+j)[5] = (ushort)(i+3);
709 (out+j)[0] = (ushort)(i+4);
710 (out+j)[1] = (ushort)(i+6);
711 (out+j)[2] = (ushort)(i+2);
712 (out+j)[3] = (ushort)(i-2);
713 (out+j)[4] = (ushort)(i+0);
714 (out+j)[5] = (ushort)(i+3);
723 ushort *out = (ushort*)_out;
727 (out+j)[0] = (ushort)(i);
735 ushort *out = (ushort*)_out;
739 (out+j)[0] = (ushort)(i);
740 (out+j)[1] = (ushort)(i+1);
748 ushort *out = (ushort*)_out;
752 (out+j)[0] = (ushort)(i);
753 (out+j)[1] = (ushort)(i+1);
761 ushort *out = (ushort*)_out;
765 (out+j)[0] = (ushort)(i);
766 (out+j)[1] = (ushort)(i+1);
768 (out+j)[0] = (ushort)(i);
769 (out+j)[1] = (ushort)(start);
776 ushort *out = (ushort*)_out;
780 (out+j)[0] = (ushort)(i);
781 (out+j)[1] = (ushort)(i+1);
782 (out+j)[2] = (ushort)(i+2);
790 ushort *out = (ushort*)_out;
794 (out+j)[0] = (ushort)(i+(i&1));
795 (out+j)[1] = (ushort)(i+1-(i&1));
796 (out+j)[2] = (ushort)(i+2);
804 ushort *out = (ushort*)_out;
808 (out+j)[0] = (ushort)(start);
809 (out+j)[1] = (ushort)(i+1);
810 (out+j)[2] = (ushort)(i+2);
818 ushort *out = (ushort*)_out;
822 (out+j+0)[0] = (ushort)(i+0);
823 (out+j+0)[1] = (ushort)(i+1);
824 (out+j+0)[2] = (ushort)(i+3);
825 (out+j+3)[0] = (ushort)(i+1);
826 (out+j+3)[1] = (ushort)(i+2);
827 (out+j+3)[2] = (ushort)(i+3);
835 ushort *out = (ushort*)_out;
839 (out+j+0)[0] = (ushort)(i+2);
840 (out+j+0)[1] = (ushort)(i+0);
841 (out+j+0)[2] = (ushort)(i+3);
842 (out+j+3)[0] = (ushort)(i+0);
843 (out+j+3)[1] = (ushort)(i+1);
844 (out+j+3)[2] = (ushort)(i+3);
852 ushort *out = (ushort*)_out;
856 (out+j)[0] = (ushort)(i+1);
857 (out+j)[1] = (ushort)(i+2);
858 (out+j)[2] = (ushort)(start);
866 ushort *out = (ushort*)_out;
870 (out+j)[0] = (ushort)(i+0);
871 (out+j)[1] = (ushort)(i+1);
872 (out+j)[2] = (ushort)(i+2);
873 (out+j)[3] = (ushort)(i+3);
881 ushort *out = (ushort*)_out;
885 (out+j)[0] = (ushort)(i+0);
886 (out+j)[1] = (ushort)(i+1);
887 (out+j)[2] = (ushort)(i+2);
888 (out+j)[3] = (ushort)(i+3);
896 ushort *out = (ushort*)_out;
900 (out+j)[0] = (ushort)(i+0);
901 (out+j)[1] = (ushort)(i+1);
902 (out+j)[2] = (ushort)(i+2);
903 (out+j)[3] = (ushort)(i+3);
904 (out+j)[4] = (ushort)(i+4);
905 (out+j)[5] = (ushort)(i+5);
913 ushort *out = (ushort*)_out;
919 (out+j)[0] = (ushort)(i+0);
920 (out+j)[1] = (ushort)(i+1);
921 (out+j)[2] = (ushort)(i+2);
922 (out+j)[3] = (ushort)(i+3);
923 (out+j)[4] = (ushort)(i+4);
924 (out+j)[5] = (ushort)(i+5);
927 (out+j)[0] = (ushort)(i+2);
928 (out+j)[1] = (ushort)(i-2);
929 (out+j)[2] = (ushort)(i+0);
930 (out+j)[3] = (ushort)(i+3);
931 (out+j)[4] = (ushort)(i+4);
932 (out+j)[5] = (ushort)(i+6);
941 uint *out = (uint*)_out;
945 (out+j)[0] = (uint)(i);
953 uint *out = (uint*)_out;
957 (out+j)[0] = (uint)(i);
958 (out+j)[1] = (uint)(i+1);
966 uint *out = (uint*)_out;
970 (out+j)[0] = (uint)(i);
971 (out+j)[1] = (uint)(i+1);
979 uint *out = (uint*)_out;
983 (out+j)[0] = (uint)(i);
984 (out+j)[1] = (uint)(i+1);
986 (out+j)[0] = (uint)(i);
987 (out+j)[1] = (uint)(start);
994 uint *out = (uint*)_out;
998 (out+j)[0] = (uint)(i);
999 (out+j)[1] = (uint)(i+1);
1000 (out+j)[2] = (uint)(i+2);
1008 uint *out = (uint*)_out;
1012 (out+j)[0] = (uint)(i);
1013 (out+j)[1] = (uint)(i+1+(i&1));
1014 (out+j)[2] = (uint)(i+2-(i&1));
1022 uint *out = (uint*)_out;
1026 (out+j)[0] = (uint)(start);
1027 (out+j)[1] = (uint)(i+1);
1028 (out+j)[2] = (uint)(i+2);
1036 uint *out = (uint*)_out;
1040 (out+j+0)[0] = (uint)(i+0);
1041 (out+j+0)[1] = (uint)(i+1);
1042 (out+j+0)[2] = (uint)(i+2);
1043 (out+j+3)[0] = (uint)(i+0);
1044 (out+j+3)[1] = (uint)(i+2);
1045 (out+j+3)[2] = (uint)(i+3);
1053 uint *out = (uint*)_out;
1057 (out+j+0)[0] = (uint)(i+0);
1058 (out+j+0)[1] = (uint)(i+1);
1059 (out+j+0)[2] = (uint)(i+3);
1060 (out+j+3)[0] = (uint)(i+0);
1061 (out+j+3)[1] = (uint)(i+3);
1062 (out+j+3)[2] = (uint)(i+2);
1070 uint *out = (uint*)_out;
1074 (out+j)[0] = (uint)(start);
1075 (out+j)[1] = (uint)(i+1);
1076 (out+j)[2] = (uint)(i+2);
1084 uint *out = (uint*)_out;
1088 (out+j)[0] = (uint)(i+0);
1089 (out+j)[1] = (uint)(i+1);
1090 (out+j)[2] = (uint)(i+2);
1091 (out+j)[3] = (uint)(i+3);
1099 uint *out = (uint*)_out;
1103 (out+j)[0] = (uint)(i+0);
1104 (out+j)[1] = (uint)(i+1);
1105 (out+j)[2] = (uint)(i+2);
1106 (out+j)[3] = (uint)(i+3);
1114 uint *out = (uint*)_out;
1118 (out+j)[0] = (uint)(i+0);
1119 (out+j)[1] = (uint)(i+1);
1120 (out+j)[2] = (uint)(i+2);
1121 (out+j)[3] = (uint)(i+3);
1122 (out+j)[4] = (uint)(i+4);
1123 (out+j)[5] = (uint)(i+5);
1131 uint *out = (uint*)_out;
1137 (out+j)[0] = (uint)(i+0);
1138 (out+j)[1] = (uint)(i+1);
1139 (out+j)[2] = (uint)(i+2);
1140 (out+j)[3] = (uint)(i+3);
1141 (out+j)[4] = (uint)(i+4);
1142 (out+j)[5] = (uint)(i+5);
1145 (out+j)[0] = (uint)(i+2);
1146 (out+j)[1] = (uint)(i-2);
1147 (out+j)[2] = (uint)(i+0);
1148 (out+j)[3] = (uint)(i+3);
1149 (out+j)[4] = (uint)(i+4);
1150 (out+j)[5] = (uint)(i+6);
1159 uint *out = (uint*)_out;
1163 (out+j)[0] = (uint)(i);
1171 uint *out = (uint*)_out;
1175 (out+j)[0] = (uint)(i+1);
1176 (out+j)[1] = (uint)(i);
1184 uint *out = (uint*)_out;
1188 (out+j)[0] = (uint)(i+1);
1189 (out+j)[1] = (uint)(i);
1197 uint *out = (uint*)_out;
1201 (out+j)[0] = (uint)(i+1);
1202 (out+j)[1] = (uint)(i);
1204 (out+j)[0] = (uint)(start);
1205 (out+j)[1] = (uint)(i);
1212 uint *out = (uint*)_out;
1216 (out+j)[0] = (uint)(i+1);
1217 (out+j)[1] = (uint)(i+2);
1218 (out+j)[2] = (uint)(i);
1226 uint *out = (uint*)_out;
1230 (out+j)[0] = (uint)(i+1+(i&1));
1231 (out+j)[1] = (uint)(i+2-(i&1));
1232 (out+j)[2] = (uint)(i);
1240 uint *out = (uint*)_out;
1244 (out+j)[0] = (uint)(i+1);
1245 (out+j)[1] = (uint)(i+2);
1246 (out+j)[2] = (uint)(start);
1254 uint *out = (uint*)_out;
1258 (out+j+0)[0] = (uint)(i+1);
1259 (out+j+0)[1] = (uint)(i+2);
1260 (out+j+0)[2] = (uint)(i+0);
1261 (out+j+3)[0] = (uint)(i+2);
1262 (out+j+3)[1] = (uint)(i+3);
1263 (out+j+3)[2] = (uint)(i+0);
1271 uint *out = (uint*)_out;
1275 (out+j+0)[0] = (uint)(i+1);
1276 (out+j+0)[1] = (uint)(i+3);
1277 (out+j+0)[2] = (uint)(i+0);
1278 (out+j+3)[0] = (uint)(i+3);
1279 (out+j+3)[1] = (uint)(i+2);
1280 (out+j+3)[2] = (uint)(i+0);
1288 uint *out = (uint*)_out;
1292 (out+j)[0] = (uint)(i+1);
1293 (out+j)[1] = (uint)(i+2);
1294 (out+j)[2] = (uint)(start);
1302 uint *out = (uint*)_out;
1306 (out+j)[0] = (uint)(i+3);
1307 (out+j)[1] = (uint)(i+2);
1308 (out+j)[2] = (uint)(i+1);
1309 (out+j)[3] = (uint)(i+0);
1317 uint *out = (uint*)_out;
1321 (out+j)[0] = (uint)(i+3);
1322 (out+j)[1] = (uint)(i+2);
1323 (out+j)[2] = (uint)(i+1);
1324 (out+j)[3] = (uint)(i+0);
1332 uint *out = (uint*)_out;
1336 (out+j)[0] = (uint)(i+4);
1337 (out+j)[1] = (uint)(i+5);
1338 (out+j)[2] = (uint)(i+0);
1339 (out+j)[3] = (uint)(i+1);
1340 (out+j)[4] = (uint)(i+2);
1341 (out+j)[5] = (uint)(i+3);
1349 uint *out = (uint*)_out;
1355 (out+j)[0] = (uint)(i+4);
1356 (out+j)[1] = (uint)(i+5);
1357 (out+j)[2] = (uint)(i+0);
1358 (out+j)[3] = (uint)(i+1);
1359 (out+j)[4] = (uint)(i+2);
1360 (out+j)[5] = (uint)(i+3);
1363 (out+j)[0] = (uint)(i+4);
1364 (out+j)[1] = (uint)(i+6);
1365 (out+j)[2] = (uint)(i+2);
1366 (out+j)[3] = (uint)(i-2);
1367 (out+j)[4] = (uint)(i+0);
1368 (out+j)[5] = (uint)(i+3);
1377 uint *out = (uint*)_out;
1381 (out+j)[0] = (uint)(i);
1389 uint *out = (uint*)_out;
1393 (out+j)[0] = (uint)(i+1);
1394 (out+j)[1] = (uint)(i);
1402 uint *out = (uint*)_out;
1406 (out+j)[0] = (uint)(i+1);
1407 (out+j)[1] = (uint)(i);
1415 uint *out = (uint*)_out;
1419 (out+j)[0] = (uint)(i+1);
1420 (out+j)[1] = (uint)(i);
1422 (out+j)[0] = (uint)(start);
1423 (out+j)[1] = (uint)(i);
1430 uint *out = (uint*)_out;
1434 (out+j)[0] = (uint)(i+2);
1435 (out+j)[1] = (uint)(i);
1436 (out+j)[2] = (uint)(i+1);
1444 uint *out = (uint*)_out;
1448 (out+j)[0] = (uint)(i+2);
1449 (out+j)[1] = (uint)(i+(i&1));
1450 (out+j)[2] = (uint)(i+1-(i&1));
1458 uint *out = (uint*)_out;
1462 (out+j)[0] = (uint)(i+2);
1463 (out+j)[1] = (uint)(start);
1464 (out+j)[2] = (uint)(i+1);
1472 uint *out = (uint*)_out;
1476 (out+j+0)[0] = (uint)(i+3);
1477 (out+j+0)[1] = (uint)(i+0);
1478 (out+j+0)[2] = (uint)(i+1);
1479 (out+j+3)[0] = (uint)(i+3);
1480 (out+j+3)[1] = (uint)(i+1);
1481 (out+j+3)[2] = (uint)(i+2);
1489 uint *out = (uint*)_out;
1493 (out+j+0)[0] = (uint)(i+3);
1494 (out+j+0)[1] = (uint)(i+2);
1495 (out+j+0)[2] = (uint)(i+0);
1496 (out+j+3)[0] = (uint)(i+3);
1497 (out+j+3)[1] = (uint)(i+0);
1498 (out+j+3)[2] = (uint)(i+1);
1506 uint *out = (uint*)_out;
1510 (out+j)[0] = (uint)(start);
1511 (out+j)[1] = (uint)(i+1);
1512 (out+j)[2] = (uint)(i+2);
1520 uint *out = (uint*)_out;
1524 (out+j)[0] = (uint)(i+3);
1525 (out+j)[1] = (uint)(i+2);
1526 (out+j)[2] = (uint)(i+1);
1527 (out+j)[3] = (uint)(i+0);
1535 uint *out = (uint*)_out;
1539 (out+j)[0] = (uint)(i+3);
1540 (out+j)[1] = (uint)(i+2);
1541 (out+j)[2] = (uint)(i+1);
1542 (out+j)[3] = (uint)(i+0);
1550 uint *out = (uint*)_out;
1554 (out+j)[0] = (uint)(i+4);
1555 (out+j)[1] = (uint)(i+5);
1556 (out+j)[2] = (uint)(i+0);
1557 (out+j)[3] = (uint)(i+1);
1558 (out+j)[4] = (uint)(i+2);
1559 (out+j)[5] = (uint)(i+3);
1567 uint *out = (uint*)_out;
1573 (out+j)[0] = (uint)(i+4);
1574 (out+j)[1] = (uint)(i+5);
1575 (out+j)[2] = (uint)(i+0);
1576 (out+j)[3] = (uint)(i+1);
1577 (out+j)[4] = (uint)(i+2);
1578 (out+j)[5] = (uint)(i+3);
1581 (out+j)[0] = (uint)(i+4);
1582 (out+j)[1] = (uint)(i+6);
1583 (out+j)[2] = (uint)(i+2);
1584 (out+j)[3] = (uint)(i-2);
1585 (out+j)[4] = (uint)(i+0);
1586 (out+j)[5] = (uint)(i+3);
1595 uint *out = (uint*)_out;
1599 (out+j)[0] = (uint)(i);
1607 uint *out = (uint*)_out;
1611 (out+j)[0] = (uint)(i);
1612 (out+j)[1] = (uint)(i+1);
1620 uint *out = (uint*)_out;
1624 (out+j)[0] = (uint)(i);
1625 (out+j)[1] = (uint)(i+1);
1633 uint *out = (uint*)_out;
1637 (out+j)[0] = (uint)(i);
1638 (out+j)[1] = (uint)(i+1);
1640 (out+j)[0] = (uint)(i);
1641 (out+j)[1] = (uint)(start);
1648 uint *out = (uint*)_out;
1652 (out+j)[0] = (uint)(i);
1653 (out+j)[1] = (uint)(i+1);
1654 (out+j)[2] = (uint)(i+2);
1662 uint *out = (uint*)_out;
1666 (out+j)[0] = (uint)(i+(i&1));
1667 (out+j)[1] = (uint)(i+1-(i&1));
1668 (out+j)[2] = (uint)(i+2);
1676 uint *out = (uint*)_out;
1680 (out+j)[0] = (uint)(start);
1681 (out+j)[1] = (uint)(i+1);
1682 (out+j)[2] = (uint)(i+2);
1690 uint *out = (uint*)_out;
1694 (out+j+0)[0] = (uint)(i+0);
1695 (out+j+0)[1] = (uint)(i+1);
1696 (out+j+0)[2] = (uint)(i+3);
1697 (out+j+3)[0] = (uint)(i+1);
1698 (out+j+3)[1] = (uint)(i+2);
1699 (out+j+3)[2] = (uint)(i+3);
1707 uint *out = (uint*)_out;
1711 (out+j+0)[0] = (uint)(i+2);
1712 (out+j+0)[1] = (uint)(i+0);
1713 (out+j+0)[2] = (uint)(i+3);
1714 (out+j+3)[0] = (uint)(i+0);
1715 (out+j+3)[1] = (uint)(i+1);
1716 (out+j+3)[2] = (uint)(i+3);
1724 uint *out = (uint*)_out;
1728 (out+j)[0] = (uint)(i+1);
1729 (out+j)[1] = (uint)(i+2);
1730 (out+j)[2] = (uint)(start);
1738 uint *out = (uint*)_out;
1742 (out+j)[0] = (uint)(i+0);
1743 (out+j)[1] = (uint)(i+1);
1744 (out+j)[2] = (uint)(i+2);
1745 (out+j)[3] = (uint)(i+3);
1753 uint *out = (uint*)_out;
1757 (out+j)[0] = (uint)(i+0);
1758 (out+j)[1] = (uint)(i+1);
1759 (out+j)[2] = (uint)(i+2);
1760 (out+j)[3] = (uint)(i+3);
1768 uint *out = (uint*)_out;
1772 (out+j)[0] = (uint)(i+0);
1773 (out+j)[1] = (uint)(i+1);
1774 (out+j)[2] = (uint)(i+2);
1775 (out+j)[3] = (uint)(i+3);
1776 (out+j)[4] = (uint)(i+4);
1777 (out+j)[5] = (uint)(i+5);
1785 uint *out = (uint*)_out;
1791 (out+j)[0] = (uint)(i+0);
1792 (out+j)[1] = (uint)(i+1);
1793 (out+j)[2] = (uint)(i+2);
1794 (out+j)[3] = (uint)(i+3);
1795 (out+j)[4] = (uint)(i+4);
1796 (out+j)[5] = (uint)(i+5);
1799 (out+j)[0] = (uint)(i+2);
1800 (out+j)[1] = (uint)(i-2);
1801 (out+j)[2] = (uint)(i+0);
1802 (out+j)[3] = (uint)(i+3);
1803 (out+j)[4] = (uint)(i+4);
1804 (out+j)[5] = (uint)(i+6);
1817 ushort *out = (ushort*)_out;
1821 (out+j)[0] = (ushort)in[i];
1833 ushort *out = (ushort*)_out;
1837 (out+j)[0] = (ushort)in[i];
1838 (out+j)[1] = (ushort)in[i+1];
1850 ushort *out = (ushort*)_out;
1854 (out+j)[0] = (ushort)in[i];
1855 (out+j)[1] = (ushort)in[i+1];
1867 ushort *out = (ushort*)_out;
1871 (out+j)[0] = (ushort)in[i];
1872 (out+j)[1] = (ushort)in[i+1];
1874 (out+j)[0] = (ushort)in[i];
1875 (out+j)[1] = (ushort)in[start];
1886 ushort *out = (ushort*)_out;
1890 (out+j)[0] = (ushort)in[i];
1891 (out+j)[1] = (ushort)in[i+1];
1892 (out+j)[2] = (ushort)in[i+2];
1904 ushort *out = (ushort*)_out;
1908 (out+j)[0] = (ushort)in[i];
1909 (out+j)[1] = (ushort)in[i+1+(i&1)];
1910 (out+j)[2] = (ushort)in[i+2-(i&1)];
1922 ushort *out = (ushort*)_out;
1926 (out+j)[0] = (ushort)in[start];
1927 (out+j)[1] = (ushort)in[i+1];
1928 (out+j)[2] = (ushort)in[i+2];
1940 ushort *out = (ushort*)_out;
1944 (out+j+0)[0] = (ushort)in[i+0];
1945 (out+j+0)[1] = (ushort)in[i+1];
1946 (out+j+0)[2] = (ushort)in[i+2];
1947 (out+j+3)[0] = (ushort)in[i+0];
1948 (out+j+3)[1] = (ushort)in[i+2];
1949 (out+j+3)[2] = (ushort)in[i+3];
1961 ushort *out = (ushort*)_out;
1965 (out+j+0)[0] = (ushort)in[i+0];
1966 (out+j+0)[1] = (ushort)in[i+1];
1967 (out+j+0)[2] = (ushort)in[i+3];
1968 (out+j+3)[0] = (ushort)in[i+0];
1969 (out+j+3)[1] = (ushort)in[i+3];
1970 (out+j+3)[2] = (ushort)in[i+2];
1982 ushort *out = (ushort*)_out;
1986 (out+j)[0] = (ushort)in[start];
1987 (out+j)[1] = (ushort)in[i+1];
1988 (out+j)[2] = (ushort)in[i+2];
2000 ushort *out = (ushort*)_out;
2004 (out+j)[0] = (ushort)in[i+0];
2005 (out+j)[1] = (ushort)in[i+1];
2006 (out+j)[2] = (ushort)in[i+2];
2007 (out+j)[3] = (ushort)in[i+3];
2019 ushort *out = (ushort*)_out;
2023 (out+j)[0] = (ushort)in[i+0];
2024 (out+j)[1] = (ushort)in[i+1];
2025 (out+j)[2] = (ushort)in[i+2];
2026 (out+j)[3] = (ushort)in[i+3];
2038 ushort *out = (ushort*)_out;
2042 (out+j)[0] = (ushort)in[i+0];
2043 (out+j)[1] = (ushort)in[i+1];
2044 (out+j)[2] = (ushort)in[i+2];
2045 (out+j)[3] = (ushort)in[i+3];
2046 (out+j)[4] = (ushort)in[i+4];
2047 (out+j)[5] = (ushort)in[i+5];
2059 ushort *out = (ushort*)_out;
2065 (out+j)[0] = (ushort)in[i+0];
2066 (out+j)[1] = (ushort)in[i+1];
2067 (out+j)[2] = (ushort)in[i+2];
2068 (out+j)[3] = (ushort)in[i+3];
2069 (out+j)[4] = (ushort)in[i+4];
2070 (out+j)[5] = (ushort)in[i+5];
2073 (out+j)[0] = (ushort)in[i+2];
2074 (out+j)[1] = (ushort)in[i-2];
2075 (out+j)[2] = (ushort)in[i+0];
2076 (out+j)[3] = (ushort)in[i+3];
2077 (out+j)[4] = (ushort)in[i+4];
2078 (out+j)[5] = (ushort)in[i+6];
2091 ushort *out = (ushort*)_out;
2095 (out+j)[0] = (ushort)in[i];
2107 ushort *out = (ushort*)_out;
2111 (out+j)[0] = (ushort)in[i];
2112 (out+j)[1] = (ushort)in[i+1];
2124 ushort *out = (ushort*)_out;
2128 (out+j)[0] = (ushort)in[i];
2129 (out+j)[1] = (ushort)in[i+1];
2141 ushort *out = (ushort*)_out;
2145 (out+j)[0] = (ushort)in[i];
2146 (out+j)[1] = (ushort)in[i+1];
2148 (out+j)[0] = (ushort)in[i];
2149 (out+j)[1] = (ushort)in[start];
2160 ushort *out = (ushort*)_out;
2164 (out+j)[0] = (ushort)in[i];
2165 (out+j)[1] = (ushort)in[i+1];
2166 (out+j)[2] = (ushort)in[i+2];
2178 ushort *out = (ushort*)_out;
2182 (out+j)[0] = (ushort)in[i];
2183 (out+j)[1] = (ushort)in[i+1+(i&1)];
2184 (out+j)[2] = (ushort)in[i+2-(i&1)];
2196 ushort *out = (ushort*)_out;
2200 (out+j)[0] = (ushort)in[start];
2201 (out+j)[1] = (ushort)in[i+1];
2202 (out+j)[2] = (ushort)in[i+2];
2214 ushort *out = (ushort*)_out;
2220 (out+j+0)[0] = restart_index;
2221 (out+j+0)[1] = restart_index;
2222 (out+j+0)[2] = restart_index;
2223 (out+j+3)[0] = restart_index;
2224 (out+j+3)[1] = restart_index;
2225 (out+j+3)[2] = restart_index;
2244 (out+j+0)[0] = (ushort)in[i+0];
2245 (out+j+0)[1] = (ushort)in[i+1];
2246 (out+j+0)[2] = (ushort)in[i+2];
2247 (out+j+3)[0] = (ushort)in[i+0];
2248 (out+j+3)[1] = (ushort)in[i+2];
2249 (out+j+3)[2] = (ushort)in[i+3];
2261 ushort *out = (ushort*)_out;
2267 (out+j+0)[0] = restart_index;
2268 (out+j+0)[1] = restart_index;
2269 (out+j+0)[2] = restart_index;
2270 (out+j+3)[0] = restart_index;
2271 (out+j+3)[1] = restart_index;
2272 (out+j+3)[2] = restart_index;
2291 (out+j+0)[0] = (ushort)in[i+0];
2292 (out+j+0)[1] = (ushort)in[i+1];
2293 (out+j+0)[2] = (ushort)in[i+3];
2294 (out+j+3)[0] = (ushort)in[i+0];
2295 (out+j+3)[1] = (ushort)in[i+3];
2296 (out+j+3)[2] = (ushort)in[i+2];
2308 ushort *out = (ushort*)_out;
2314 (out+j+0)[0] = restart_index;
2315 (out+j+0)[1] = restart_index;
2316 (out+j+0)[2] = restart_index;
2334 (out+j)[0] = (ushort)in[start];
2335 (out+j)[1] = (ushort)in[i+1];
2336 (out+j)[2] = (ushort)in[i+2];
2348 ushort *out = (ushort*)_out;
2352 (out+j)[0] = (ushort)in[i+0];
2353 (out+j)[1] = (ushort)in[i+1];
2354 (out+j)[2] = (ushort)in[i+2];
2355 (out+j)[3] = (ushort)in[i+3];
2367 ushort *out = (ushort*)_out;
2371 (out+j)[0] = (ushort)in[i+0];
2372 (out+j)[1] = (ushort)in[i+1];
2373 (out+j)[2] = (ushort)in[i+2];
2374 (out+j)[3] = (ushort)in[i+3];
2386 ushort *out = (ushort*)_out;
2390 (out+j)[0] = (ushort)in[i+0];
2391 (out+j)[1] = (ushort)in[i+1];
2392 (out+j)[2] = (ushort)in[i+2];
2393 (out+j)[3] = (ushort)in[i+3];
2394 (out+j)[4] = (ushort)in[i+4];
2395 (out+j)[5] = (ushort)in[i+5];
2407 ushort *out = (ushort*)_out;
2413 (out+j)[0] = (ushort)in[i+0];
2414 (out+j)[1] = (ushort)in[i+1];
2415 (out+j)[2] = (ushort)in[i+2];
2416 (out+j)[3] = (ushort)in[i+3];
2417 (out+j)[4] = (ushort)in[i+4];
2418 (out+j)[5] = (ushort)in[i+5];
2421 (out+j)[0] = (ushort)in[i+2];
2422 (out+j)[1] = (ushort)in[i-2];
2423 (out+j)[2] = (ushort)in[i+0];
2424 (out+j)[3] = (ushort)in[i+3];
2425 (out+j)[4] = (ushort)in[i+4];
2426 (out+j)[5] = (ushort)in[i+6];
2439 ushort *out = (ushort*)_out;
2443 (out+j)[0] = (ushort)in[i];
2455 ushort *out = (ushort*)_out;
2459 (out+j)[0] = (ushort)in[i+1];
2460 (out+j)[1] = (ushort)in[i];
2472 ushort *out = (ushort*)_out;
2476 (out+j)[0] = (ushort)in[i+1];
2477 (out+j)[1] = (ushort)in[i];
2489 ushort *out = (ushort*)_out;
2493 (out+j)[0] = (ushort)in[i+1];
2494 (out+j)[1] = (ushort)in[i];
2496 (out+j)[0] = (ushort)in[start];
2497 (out+j)[1] = (ushort)in[i];
2508 ushort *out = (ushort*)_out;
2512 (out+j)[0] = (ushort)in[i+1];
2513 (out+j)[1] = (ushort)in[i+2];
2514 (out+j)[2] = (ushort)in[i];
2526 ushort *out = (ushort*)_out;
2530 (out+j)[0] = (ushort)in[i+1+(i&1)];
2531 (out+j)[1] = (ushort)in[i+2-(i&1)];
2532 (out+j)[2] = (ushort)in[i];
2544 ushort *out = (ushort*)_out;
2548 (out+j)[0] = (ushort)in[i+1];
2549 (out+j)[1] = (ushort)in[i+2];
2550 (out+j)[2] = (ushort)in[start];
2562 ushort *out = (ushort*)_out;
2566 (out+j+0)[0] = (ushort)in[i+1];
2567 (out+j+0)[1] = (ushort)in[i+2];
2568 (out+j+0)[2] = (ushort)in[i+0];
2569 (out+j+3)[0] = (ushort)in[i+2];
2570 (out+j+3)[1] = (ushort)in[i+3];
2571 (out+j+3)[2] = (ushort)in[i+0];
2583 ushort *out = (ushort*)_out;
2587 (out+j+0)[0] = (ushort)in[i+1];
2588 (out+j+0)[1] = (ushort)in[i+3];
2589 (out+j+0)[2] = (ushort)in[i+0];
2590 (out+j+3)[0] = (ushort)in[i+3];
2591 (out+j+3)[1] = (ushort)in[i+2];
2592 (out+j+3)[2] = (ushort)in[i+0];
2604 ushort *out = (ushort*)_out;
2608 (out+j)[0] = (ushort)in[i+1];
2609 (out+j)[1] = (ushort)in[i+2];
2610 (out+j)[2] = (ushort)in[start];
2622 ushort *out = (ushort*)_out;
2626 (out+j)[0] = (ushort)in[i+3];
2627 (out+j)[1] = (ushort)in[i+2];
2628 (out+j)[2] = (ushort)in[i+1];
2629 (out+j)[3] = (ushort)in[i+0];
2641 ushort *out = (ushort*)_out;
2645 (out+j)[0] = (ushort)in[i+3];
2646 (out+j)[1] = (ushort)in[i+2];
2647 (out+j)[2] = (ushort)in[i+1];
2648 (out+j)[3] = (ushort)in[i+0];
2660 ushort *out = (ushort*)_out;
2664 (out+j)[0] = (ushort)in[i+4];
2665 (out+j)[1] = (ushort)in[i+5];
2666 (out+j)[2] = (ushort)in[i+0];
2667 (out+j)[3] = (ushort)in[i+1];
2668 (out+j)[4] = (ushort)in[i+2];
2669 (out+j)[5] = (ushort)in[i+3];
2681 ushort *out = (ushort*)_out;
2687 (out+j)[0] = (ushort)in[i+4];
2688 (out+j)[1] = (ushort)in[i+5];
2689 (out+j)[2] = (ushort)in[i+0];
2690 (out+j)[3] = (ushort)in[i+1];
2691 (out+j)[4] = (ushort)in[i+2];
2692 (out+j)[5] = (ushort)in[i+3];
2695 (out+j)[0] = (ushort)in[i+4];
2696 (out+j)[1] = (ushort)in[i+6];
2697 (out+j)[2] = (ushort)in[i+2];
2698 (out+j)[3] = (ushort)in[i-2];
2699 (out+j)[4] = (ushort)in[i+0];
2700 (out+j)[5] = (ushort)in[i+3];
2713 ushort *out = (ushort*)_out;
2717 (out+j)[0] = (ushort)in[i];
2729 ushort *out = (ushort*)_out;
2733 (out+j)[0] = (ushort)in[i+1];
2734 (out+j)[1] = (ushort)in[i];
2746 ushort *out = (ushort*)_out;
2750 (out+j)[0] = (ushort)in[i+1];
2751 (out+j)[1] = (ushort)in[i];
2763 ushort *out = (ushort*)_out;
2767 (out+j)[0] = (ushort)in[i+1];
2768 (out+j)[1] = (ushort)in[i];
2770 (out+j)[0] = (ushort)in[start];
2771 (out+j)[1] = (ushort)in[i];
2782 ushort *out = (ushort*)_out;
2786 (out+j)[0] = (ushort)in[i+1];
2787 (out+j)[1] = (ushort)in[i+2];
2788 (out+j)[2] = (ushort)in[i];
2800 ushort *out = (ushort*)_out;
2804 (out+j)[0] = (ushort)in[i+1+(i&1)];
2805 (out+j)[1] = (ushort)in[i+2-(i&1)];
2806 (out+j)[2] = (ushort)in[i];
2818 ushort *out = (ushort*)_out;
2822 (out+j)[0] = (ushort)in[i+1];
2823 (out+j)[1] = (ushort)in[i+2];
2824 (out+j)[2] = (ushort)in[start];
2836 ushort *out = (ushort*)_out;
2842 (out+j+0)[0] = restart_index;
2843 (out+j+0)[1] = restart_index;
2844 (out+j+0)[2] = restart_index;
2845 (out+j+3)[0] = restart_index;
2846 (out+j+3)[1] = restart_index;
2847 (out+j+3)[2] = restart_index;
2866 (out+j+0)[0] = (ushort)in[i+1];
2867 (out+j+0)[1] = (ushort)in[i+2];
2868 (out+j+0)[2] = (ushort)in[i+0];
2869 (out+j+3)[0] = (ushort)in[i+2];
2870 (out+j+3)[1] = (ushort)in[i+3];
2871 (out+j+3)[2] = (ushort)in[i+0];
2883 ushort *out = (ushort*)_out;
2889 (out+j+0)[0] = restart_index;
2890 (out+j+0)[1] = restart_index;
2891 (out+j+0)[2] = restart_index;
2892 (out+j+3)[0] = restart_index;
2893 (out+j+3)[1] = restart_index;
2894 (out+j+3)[2] = restart_index;
2913 (out+j+0)[0] = (ushort)in[i+1];
2914 (out+j+0)[1] = (ushort)in[i+3];
2915 (out+j+0)[2] = (ushort)in[i+0];
2916 (out+j+3)[0] = (ushort)in[i+3];
2917 (out+j+3)[1] = (ushort)in[i+2];
2918 (out+j+3)[2] = (ushort)in[i+0];
2930 ushort *out = (ushort*)_out;
2936 (out+j+0)[0] = restart_index;
2937 (out+j+0)[1] = restart_index;
2938 (out+j+0)[2] = restart_index;
2956 (out+j)[0] = (ushort)in[i+1];
2957 (out+j)[1] = (ushort)in[i+2];
2958 (out+j)[2] = (ushort)in[start];
2970 ushort *out = (ushort*)_out;
2974 (out+j)[0] = (ushort)in[i+3];
2975 (out+j)[1] = (ushort)in[i+2];
2976 (out+j)[2] = (ushort)in[i+1];
2977 (out+j)[3] = (ushort)in[i+0];
2989 ushort *out = (ushort*)_out;
2993 (out+j)[0] = (ushort)in[i+3];
2994 (out+j)[1] = (ushort)in[i+2];
2995 (out+j)[2] = (ushort)in[i+1];
2996 (out+j)[3] = (ushort)in[i+0];
3008 ushort *out = (ushort*)_out;
3012 (out+j)[0] = (ushort)in[i+4];
3013 (out+j)[1] = (ushort)in[i+5];
3014 (out+j)[2] = (ushort)in[i+0];
3015 (out+j)[3] = (ushort)in[i+1];
3016 (out+j)[4] = (ushort)in[i+2];
3017 (out+j)[5] = (ushort)in[i+3];
3029 ushort *out = (ushort*)_out;
3035 (out+j)[0] = (ushort)in[i+4];
3036 (out+j)[1] = (ushort)in[i+5];
3037 (out+j)[2] = (ushort)in[i+0];
3038 (out+j)[3] = (ushort)in[i+1];
3039 (out+j)[4] = (ushort)in[i+2];
3040 (out+j)[5] = (ushort)in[i+3];
3043 (out+j)[0] = (ushort)in[i+4];
3044 (out+j)[1] = (ushort)in[i+6];
3045 (out+j)[2] = (ushort)in[i+2];
3046 (out+j)[3] = (ushort)in[i-2];
3047 (out+j)[4] = (ushort)in[i+0];
3048 (out+j)[5] = (ushort)in[i+3];
3061 ushort *out = (ushort*)_out;
3065 (out+j)[0] = (ushort)in[i];
3077 ushort *out = (ushort*)_out;
3081 (out+j)[0] = (ushort)in[i+1];
3082 (out+j)[1] = (ushort)in[i];
3094 ushort *out = (ushort*)_out;
3098 (out+j)[0] = (ushort)in[i+1];
3099 (out+j)[1] = (ushort)in[i];
3111 ushort *out = (ushort*)_out;
3115 (out+j)[0] = (ushort)in[i+1];
3116 (out+j)[1] = (ushort)in[i];
3118 (out+j)[0] = (ushort)in[start];
3119 (out+j)[1] = (ushort)in[i];
3130 ushort *out = (ushort*)_out;
3134 (out+j)[0] = (ushort)in[i+2];
3135 (out+j)[1] = (ushort)in[i];
3136 (out+j)[2] = (ushort)in[i+1];
3148 ushort *out = (ushort*)_out;
3152 (out+j)[0] = (ushort)in[i+2];
3153 (out+j)[1] = (ushort)in[i+(i&1)];
3154 (out+j)[2] = (ushort)in[i+1-(i&1)];
3166 ushort *out = (ushort*)_out;
3170 (out+j)[0] = (ushort)in[i+2];
3171 (out+j)[1] = (ushort)in[start];
3172 (out+j)[2] = (ushort)in[i+1];
3184 ushort *out = (ushort*)_out;
3188 (out+j+0)[0] = (ushort)in[i+3];
3189 (out+j+0)[1] = (ushort)in[i+0];
3190 (out+j+0)[2] = (ushort)in[i+1];
3191 (out+j+3)[0] = (ushort)in[i+3];
3192 (out+j+3)[1] = (ushort)in[i+1];
3193 (out+j+3)[2] = (ushort)in[i+2];
3205 ushort *out = (ushort*)_out;
3209 (out+j+0)[0] = (ushort)in[i+3];
3210 (out+j+0)[1] = (ushort)in[i+2];
3211 (out+j+0)[2] = (ushort)in[i+0];
3212 (out+j+3)[0] = (ushort)in[i+3];
3213 (out+j+3)[1] = (ushort)in[i+0];
3214 (out+j+3)[2] = (ushort)in[i+1];
3226 ushort *out = (ushort*)_out;
3230 (out+j)[0] = (ushort)in[start];
3231 (out+j)[1] = (ushort)in[i+1];
3232 (out+j)[2] = (ushort)in[i+2];
3244 ushort *out = (ushort*)_out;
3248 (out+j)[0] = (ushort)in[i+3];
3249 (out+j)[1] = (ushort)in[i+2];
3250 (out+j)[2] = (ushort)in[i+1];
3251 (out+j)[3] = (ushort)in[i+0];
3263 ushort *out = (ushort*)_out;
3267 (out+j)[0] = (ushort)in[i+3];
3268 (out+j)[1] = (ushort)in[i+2];
3269 (out+j)[2] = (ushort)in[i+1];
3270 (out+j)[3] = (ushort)in[i+0];
3282 ushort *out = (ushort*)_out;
3286 (out+j)[0] = (ushort)in[i+4];
3287 (out+j)[1] = (ushort)in[i+5];
3288 (out+j)[2] = (ushort)in[i+0];
3289 (out+j)[3] = (ushort)in[i+1];
3290 (out+j)[4] = (ushort)in[i+2];
3291 (out+j)[5] = (ushort)in[i+3];
3303 ushort *out = (ushort*)_out;
3309 (out+j)[0] = (ushort)in[i+4];
3310 (out+j)[1] = (ushort)in[i+5];
3311 (out+j)[2] = (ushort)in[i+0];
3312 (out+j)[3] = (ushort)in[i+1];
3313 (out+j)[4] = (ushort)in[i+2];
3314 (out+j)[5] = (ushort)in[i+3];
3317 (out+j)[0] = (ushort)in[i+4];
3318 (out+j)[1] = (ushort)in[i+6];
3319 (out+j)[2] = (ushort)in[i+2];
3320 (out+j)[3] = (ushort)in[i-2];
3321 (out+j)[4] = (ushort)in[i+0];
3322 (out+j)[5] = (ushort)in[i+3];
3335 ushort *out = (ushort*)_out;
3339 (out+j)[0] = (ushort)in[i];
3351 ushort *out = (ushort*)_out;
3355 (out+j)[0] = (ushort)in[i+1];
3356 (out+j)[1] = (ushort)in[i];
3368 ushort *out = (ushort*)_out;
3372 (out+j)[0] = (ushort)in[i+1];
3373 (out+j)[1] = (ushort)in[i];
3385 ushort *out = (ushort*)_out;
3389 (out+j)[0] = (ushort)in[i+1];
3390 (out+j)[1] = (ushort)in[i];
3392 (out+j)[0] = (ushort)in[start];
3393 (out+j)[1] = (ushort)in[i];
3404 ushort *out = (ushort*)_out;
3408 (out+j)[0] = (ushort)in[i+2];
3409 (out+j)[1] = (ushort)in[i];
3410 (out+j)[2] = (ushort)in[i+1];
3422 ushort *out = (ushort*)_out;
3426 (out+j)[0] = (ushort)in[i+2];
3427 (out+j)[1] = (ushort)in[i+(i&1)];
3428 (out+j)[2] = (ushort)in[i+1-(i&1)];
3440 ushort *out = (ushort*)_out;
3444 (out+j)[0] = (ushort)in[i+2];
3445 (out+j)[1] = (ushort)in[start];
3446 (out+j)[2] = (ushort)in[i+1];
3458 ushort *out = (ushort*)_out;
3464 (out+j+0)[0] = restart_index;
3465 (out+j+0)[1] = restart_index;
3466 (out+j+0)[2] = restart_index;
3467 (out+j+3)[0] = restart_index;
3468 (out+j+3)[1] = restart_index;
3469 (out+j+3)[2] = restart_index;
3488 (out+j+0)[0] = (ushort)in[i+3];
3489 (out+j+0)[1] = (ushort)in[i+0];
3490 (out+j+0)[2] = (ushort)in[i+1];
3491 (out+j+3)[0] = (ushort)in[i+3];
3492 (out+j+3)[1] = (ushort)in[i+1];
3493 (out+j+3)[2] = (ushort)in[i+2];
3505 ushort *out = (ushort*)_out;
3511 (out+j+0)[0] = restart_index;
3512 (out+j+0)[1] = restart_index;
3513 (out+j+0)[2] = restart_index;
3514 (out+j+3)[0] = restart_index;
3515 (out+j+3)[1] = restart_index;
3516 (out+j+3)[2] = restart_index;
3535 (out+j+0)[0] = (ushort)in[i+3];
3536 (out+j+0)[1] = (ushort)in[i+2];
3537 (out+j+0)[2] = (ushort)in[i+0];
3538 (out+j+3)[0] = (ushort)in[i+3];
3539 (out+j+3)[1] = (ushort)in[i+0];
3540 (out+j+3)[2] = (ushort)in[i+1];
3552 ushort *out = (ushort*)_out;
3558 (out+j+0)[0] = restart_index;
3559 (out+j+0)[1] = restart_index;
3560 (out+j+0)[2] = restart_index;
3578 (out+j)[0] = (ushort)in[start];
3579 (out+j)[1] = (ushort)in[i+1];
3580 (out+j)[2] = (ushort)in[i+2];
3592 ushort *out = (ushort*)_out;
3596 (out+j)[0] = (ushort)in[i+3];
3597 (out+j)[1] = (ushort)in[i+2];
3598 (out+j)[2] = (ushort)in[i+1];
3599 (out+j)[3] = (ushort)in[i+0];
3611 ushort *out = (ushort*)_out;
3615 (out+j)[0] = (ushort)in[i+3];
3616 (out+j)[1] = (ushort)in[i+2];
3617 (out+j)[2] = (ushort)in[i+1];
3618 (out+j)[3] = (ushort)in[i+0];
3630 ushort *out = (ushort*)_out;
3634 (out+j)[0] = (ushort)in[i+4];
3635 (out+j)[1] = (ushort)in[i+5];
3636 (out+j)[2] = (ushort)in[i+0];
3637 (out+j)[3] = (ushort)in[i+1];
3638 (out+j)[4] = (ushort)in[i+2];
3639 (out+j)[5] = (ushort)in[i+3];
3651 ushort *out = (ushort*)_out;
3657 (out+j)[0] = (ushort)in[i+4];
3658 (out+j)[1] = (ushort)in[i+5];
3659 (out+j)[2] = (ushort)in[i+0];
3660 (out+j)[3] = (ushort)in[i+1];
3661 (out+j)[4] = (ushort)in[i+2];
3662 (out+j)[5] = (ushort)in[i+3];
3665 (out+j)[0] = (ushort)in[i+4];
3666 (out+j)[1] = (ushort)in[i+6];
3667 (out+j)[2] = (ushort)in[i+2];
3668 (out+j)[3] = (ushort)in[i-2];
3669 (out+j)[4] = (ushort)in[i+0];
3670 (out+j)[5] = (ushort)in[i+3];
3683 ushort *out = (ushort*)_out;
3687 (out+j)[0] = (ushort)in[i];
3699 ushort *out = (ushort*)_out;
3703 (out+j)[0] = (ushort)in[i];
3704 (out+j)[1] = (ushort)in[i+1];
3716 ushort *out = (ushort*)_out;
3720 (out+j)[0] = (ushort)in[i];
3721 (out+j)[1] = (ushort)in[i+1];
3733 ushort *out = (ushort*)_out;
3737 (out+j)[0] = (ushort)in[i];
3738 (out+j)[1] = (ushort)in[i+1];
3740 (out+j)[0] = (ushort)in[i];
3741 (out+j)[1] = (ushort)in[start];
3752 ushort *out = (ushort*)_out;
3756 (out+j)[0] = (ushort)in[i];
3757 (out+j)[1] = (ushort)in[i+1];
3758 (out+j)[2] = (ushort)in[i+2];
3770 ushort *out = (ushort*)_out;
3774 (out+j)[0] = (ushort)in[i+(i&1)];
3775 (out+j)[1] = (ushort)in[i+1-(i&1)];
3776 (out+j)[2] = (ushort)in[i+2];
3788 ushort *out = (ushort*)_out;
3792 (out+j)[0] = (ushort)in[start];
3793 (out+j)[1] = (ushort)in[i+1];
3794 (out+j)[2] = (ushort)in[i+2];
3806 ushort *out = (ushort*)_out;
3810 (out+j+0)[0] = (ushort)in[i+0];
3811 (out+j+0)[1] = (ushort)in[i+1];
3812 (out+j+0)[2] = (ushort)in[i+3];
3813 (out+j+3)[0] = (ushort)in[i+1];
3814 (out+j+3)[1] = (ushort)in[i+2];
3815 (out+j+3)[2] = (ushort)in[i+3];
3827 ushort *out = (ushort*)_out;
3831 (out+j+0)[0] = (ushort)in[i+2];
3832 (out+j+0)[1] = (ushort)in[i+0];
3833 (out+j+0)[2] = (ushort)in[i+3];
3834 (out+j+3)[0] = (ushort)in[i+0];
3835 (out+j+3)[1] = (ushort)in[i+1];
3836 (out+j+3)[2] = (ushort)in[i+3];
3848 ushort *out = (ushort*)_out;
3852 (out+j)[0] = (ushort)in[i+1];
3853 (out+j)[1] = (ushort)in[i+2];
3854 (out+j)[2] = (ushort)in[start];
3866 ushort *out = (ushort*)_out;
3870 (out+j)[0] = (ushort)in[i+0];
3871 (out+j)[1] = (ushort)in[i+1];
3872 (out+j)[2] = (ushort)in[i+2];
3873 (out+j)[3] = (ushort)in[i+3];
3885 ushort *out = (ushort*)_out;
3889 (out+j)[0] = (ushort)in[i+0];
3890 (out+j)[1] = (ushort)in[i+1];
3891 (out+j)[2] = (ushort)in[i+2];
3892 (out+j)[3] = (ushort)in[i+3];
3904 ushort *out = (ushort*)_out;
3908 (out+j)[0] = (ushort)in[i+0];
3909 (out+j)[1] = (ushort)in[i+1];
3910 (out+j)[2] = (ushort)in[i+2];
3911 (out+j)[3] = (ushort)in[i+3];
3912 (out+j)[4] = (ushort)in[i+4];
3913 (out+j)[5] = (ushort)in[i+5];
3925 ushort *out = (ushort*)_out;
3931 (out+j)[0] = (ushort)in[i+0];
3932 (out+j)[1] = (ushort)in[i+1];
3933 (out+j)[2] = (ushort)in[i+2];
3934 (out+j)[3] = (ushort)in[i+3];
3935 (out+j)[4] = (ushort)in[i+4];
3936 (out+j)[5] = (ushort)in[i+5];
3939 (out+j)[0] = (ushort)in[i+2];
3940 (out+j)[1] = (ushort)in[i-2];
3941 (out+j)[2] = (ushort)in[i+0];
3942 (out+j)[3] = (ushort)in[i+3];
3943 (out+j)[4] = (ushort)in[i+4];
3944 (out+j)[5] = (ushort)in[i+6];
3957 ushort *out = (ushort*)_out;
3961 (out+j)[0] = (ushort)in[i];
3973 ushort *out = (ushort*)_out;
3977 (out+j)[0] = (ushort)in[i];
3978 (out+j)[1] = (ushort)in[i+1];
3990 ushort *out = (ushort*)_out;
3994 (out+j)[0] = (ushort)in[i];
3995 (out+j)[1] = (ushort)in[i+1];
4007 ushort *out = (ushort*)_out;
4011 (out+j)[0] = (ushort)in[i];
4012 (out+j)[1] = (ushort)in[i+1];
4014 (out+j)[0] = (ushort)in[i];
4015 (out+j)[1] = (ushort)in[start];
4026 ushort *out = (ushort*)_out;
4030 (out+j)[0] = (ushort)in[i];
4031 (out+j)[1] = (ushort)in[i+1];
4032 (out+j)[2] = (ushort)in[i+2];
4044 ushort *out = (ushort*)_out;
4048 (out+j)[0] = (ushort)in[i+(i&1)];
4049 (out+j)[1] = (ushort)in[i+1-(i&1)];
4050 (out+j)[2] = (ushort)in[i+2];
4062 ushort *out = (ushort*)_out;
4066 (out+j)[0] = (ushort)in[start];
4067 (out+j)[1] = (ushort)in[i+1];
4068 (out+j)[2] = (ushort)in[i+2];
4080 ushort *out = (ushort*)_out;
4086 (out+j+0)[0] = restart_index;
4087 (out+j+0)[1] = restart_index;
4088 (out+j+0)[2] = restart_index;
4089 (out+j+3)[0] = restart_index;
4090 (out+j+3)[1] = restart_index;
4091 (out+j+3)[2] = restart_index;
4110 (out+j+0)[0] = (ushort)in[i+0];
4111 (out+j+0)[1] = (ushort)in[i+1];
4112 (out+j+0)[2] = (ushort)in[i+3];
4113 (out+j+3)[0] = (ushort)in[i+1];
4114 (out+j+3)[1] = (ushort)in[i+2];
4115 (out+j+3)[2] = (ushort)in[i+3];
4127 ushort *out = (ushort*)_out;
4133 (out+j+0)[0] = restart_index;
4134 (out+j+0)[1] = restart_index;
4135 (out+j+0)[2] = restart_index;
4136 (out+j+3)[0] = restart_index;
4137 (out+j+3)[1] = restart_index;
4138 (out+j+3)[2] = restart_index;
4157 (out+j+0)[0] = (ushort)in[i+2];
4158 (out+j+0)[1] = (ushort)in[i+0];
4159 (out+j+0)[2] = (ushort)in[i+3];
4160 (out+j+3)[0] = (ushort)in[i+0];
4161 (out+j+3)[1] = (ushort)in[i+1];
4162 (out+j+3)[2] = (ushort)in[i+3];
4174 ushort *out = (ushort*)_out;
4180 (out+j+0)[0] = restart_index;
4181 (out+j+0)[1] = restart_index;
4182 (out+j+0)[2] = restart_index;
4200 (out+j)[0] = (ushort)in[i+1];
4201 (out+j)[1] = (ushort)in[i+2];
4202 (out+j)[2] = (ushort)in[start];
4214 ushort *out = (ushort*)_out;
4218 (out+j)[0] = (ushort)in[i+0];
4219 (out+j)[1] = (ushort)in[i+1];
4220 (out+j)[2] = (ushort)in[i+2];
4221 (out+j)[3] = (ushort)in[i+3];
4233 ushort *out = (ushort*)_out;
4237 (out+j)[0] = (ushort)in[i+0];
4238 (out+j)[1] = (ushort)in[i+1];
4239 (out+j)[2] = (ushort)in[i+2];
4240 (out+j)[3] = (ushort)in[i+3];
4252 ushort *out = (ushort*)_out;
4256 (out+j)[0] = (ushort)in[i+0];
4257 (out+j)[1] = (ushort)in[i+1];
4258 (out+j)[2] = (ushort)in[i+2];
4259 (out+j)[3] = (ushort)in[i+3];
4260 (out+j)[4] = (ushort)in[i+4];
4261 (out+j)[5] = (ushort)in[i+5];
4273 ushort *out = (ushort*)_out;
4279 (out+j)[0] = (ushort)in[i+0];
4280 (out+j)[1] = (ushort)in[i+1];
4281 (out+j)[2] = (ushort)in[i+2];
4282 (out+j)[3] = (ushort)in[i+3];
4283 (out+j)[4] = (ushort)in[i+4];
4284 (out+j)[5] = (ushort)in[i+5];
4287 (out+j)[0] = (ushort)in[i+2];
4288 (out+j)[1] = (ushort)in[i-2];
4289 (out+j)[2] = (ushort)in[i+0];
4290 (out+j)[3] = (ushort)in[i+3];
4291 (out+j)[4] = (ushort)in[i+4];
4292 (out+j)[5] = (ushort)in[i+6];
4305 uint *out = (uint*)_out;
4309 (out+j)[0] = (uint)in[i];
4321 uint *out = (uint*)_out;
4325 (out+j)[0] = (uint)in[i];
4326 (out+j)[1] = (uint)in[i+1];
4338 uint *out = (uint*)_out;
4342 (out+j)[0] = (uint)in[i];
4343 (out+j)[1] = (uint)in[i+1];
4355 uint *out = (uint*)_out;
4359 (out+j)[0] = (uint)in[i];
4360 (out+j)[1] = (uint)in[i+1];
4362 (out+j)[0] = (uint)in[i];
4363 (out+j)[1] = (uint)in[start];
4374 uint *out = (uint*)_out;
4378 (out+j)[0] = (uint)in[i];
4379 (out+j)[1] = (uint)in[i+1];
4380 (out+j)[2] = (uint)in[i+2];
4392 uint *out = (uint*)_out;
4396 (out+j)[0] = (uint)in[i];
4397 (out+j)[1] = (uint)in[i+1+(i&1)];
4398 (out+j)[2] = (uint)in[i+2-(i&1)];
4410 uint *out = (uint*)_out;
4414 (out+j)[0] = (uint)in[start];
4415 (out+j)[1] = (uint)in[i+1];
4416 (out+j)[2] = (uint)in[i+2];
4428 uint *out = (uint*)_out;
4432 (out+j+0)[0] = (uint)in[i+0];
4433 (out+j+0)[1] = (uint)in[i+1];
4434 (out+j+0)[2] = (uint)in[i+2];
4435 (out+j+3)[0] = (uint)in[i+0];
4436 (out+j+3)[1] = (uint)in[i+2];
4437 (out+j+3)[2] = (uint)in[i+3];
4449 uint *out = (uint*)_out;
4453 (out+j+0)[0] = (uint)in[i+0];
4454 (out+j+0)[1] = (uint)in[i+1];
4455 (out+j+0)[2] = (uint)in[i+3];
4456 (out+j+3)[0] = (uint)in[i+0];
4457 (out+j+3)[1] = (uint)in[i+3];
4458 (out+j+3)[2] = (uint)in[i+2];
4470 uint *out = (uint*)_out;
4474 (out+j)[0] = (uint)in[start];
4475 (out+j)[1] = (uint)in[i+1];
4476 (out+j)[2] = (uint)in[i+2];
4488 uint *out = (uint*)_out;
4492 (out+j)[0] = (uint)in[i+0];
4493 (out+j)[1] = (uint)in[i+1];
4494 (out+j)[2] = (uint)in[i+2];
4495 (out+j)[3] = (uint)in[i+3];
4507 uint *out = (uint*)_out;
4511 (out+j)[0] = (uint)in[i+0];
4512 (out+j)[1] = (uint)in[i+1];
4513 (out+j)[2] = (uint)in[i+2];
4514 (out+j)[3] = (uint)in[i+3];
4526 uint *out = (uint*)_out;
4530 (out+j)[0] = (uint)in[i+0];
4531 (out+j)[1] = (uint)in[i+1];
4532 (out+j)[2] = (uint)in[i+2];
4533 (out+j)[3] = (uint)in[i+3];
4534 (out+j)[4] = (uint)in[i+4];
4535 (out+j)[5] = (uint)in[i+5];
4547 uint *out = (uint*)_out;
4553 (out+j)[0] = (uint)in[i+0];
4554 (out+j)[1] = (uint)in[i+1];
4555 (out+j)[2] = (uint)in[i+2];
4556 (out+j)[3] = (uint)in[i+3];
4557 (out+j)[4] = (uint)in[i+4];
4558 (out+j)[5] = (uint)in[i+5];
4561 (out+j)[0] = (uint)in[i+2];
4562 (out+j)[1] = (uint)in[i-2];
4563 (out+j)[2] = (uint)in[i+0];
4564 (out+j)[3] = (uint)in[i+3];
4565 (out+j)[4] = (uint)in[i+4];
4566 (out+j)[5] = (uint)in[i+6];
4579 uint *out = (uint*)_out;
4583 (out+j)[0] = (uint)in[i];
4595 uint *out = (uint*)_out;
4599 (out+j)[0] = (uint)in[i];
4600 (out+j)[1] = (uint)in[i+1];
4612 uint *out = (uint*)_out;
4616 (out+j)[0] = (uint)in[i];
4617 (out+j)[1] = (uint)in[i+1];
4629 uint *out = (uint*)_out;
4633 (out+j)[0] = (uint)in[i];
4634 (out+j)[1] = (uint)in[i+1];
4636 (out+j)[0] = (uint)in[i];
4637 (out+j)[1] = (uint)in[start];
4648 uint *out = (uint*)_out;
4652 (out+j)[0] = (uint)in[i];
4653 (out+j)[1] = (uint)in[i+1];
4654 (out+j)[2] = (uint)in[i+2];
4666 uint *out = (uint*)_out;
4670 (out+j)[0] = (uint)in[i];
4671 (out+j)[1] = (uint)in[i+1+(i&1)];
4672 (out+j)[2] = (uint)in[i+2-(i&1)];
4684 uint *out = (uint*)_out;
4688 (out+j)[0] = (uint)in[start];
4689 (out+j)[1] = (uint)in[i+1];
4690 (out+j)[2] = (uint)in[i+2];
4702 uint *out = (uint*)_out;
4708 (out+j+0)[0] = restart_index;
4709 (out+j+0)[1] = restart_index;
4710 (out+j+0)[2] = restart_index;
4711 (out+j+3)[0] = restart_index;
4712 (out+j+3)[1] = restart_index;
4713 (out+j+3)[2] = restart_index;
4732 (out+j+0)[0] = (uint)in[i+0];
4733 (out+j+0)[1] = (uint)in[i+1];
4734 (out+j+0)[2] = (uint)in[i+2];
4735 (out+j+3)[0] = (uint)in[i+0];
4736 (out+j+3)[1] = (uint)in[i+2];
4737 (out+j+3)[2] = (uint)in[i+3];
4749 uint *out = (uint*)_out;
4755 (out+j+0)[0] = restart_index;
4756 (out+j+0)[1] = restart_index;
4757 (out+j+0)[2] = restart_index;
4758 (out+j+3)[0] = restart_index;
4759 (out+j+3)[1] = restart_index;
4760 (out+j+3)[2] = restart_index;
4779 (out+j+0)[0] = (uint)in[i+0];
4780 (out+j+0)[1] = (uint)in[i+1];
4781 (out+j+0)[2] = (uint)in[i+3];
4782 (out+j+3)[0] = (uint)in[i+0];
4783 (out+j+3)[1] = (uint)in[i+3];
4784 (out+j+3)[2] = (uint)in[i+2];
4796 uint *out = (uint*)_out;
4802 (out+j+0)[0] = restart_index;
4803 (out+j+0)[1] = restart_index;
4804 (out+j+0)[2] = restart_index;
4822 (out+j)[0] = (uint)in[start];
4823 (out+j)[1] = (uint)in[i+1];
4824 (out+j)[2] = (uint)in[i+2];
4836 uint *out = (uint*)_out;
4840 (out+j)[0] = (uint)in[i+0];
4841 (out+j)[1] = (uint)in[i+1];
4842 (out+j)[2] = (uint)in[i+2];
4843 (out+j)[3] = (uint)in[i+3];
4855 uint *out = (uint*)_out;
4859 (out+j)[0] = (uint)in[i+0];
4860 (out+j)[1] = (uint)in[i+1];
4861 (out+j)[2] = (uint)in[i+2];
4862 (out+j)[3] = (uint)in[i+3];
4874 uint *out = (uint*)_out;
4878 (out+j)[0] = (uint)in[i+0];
4879 (out+j)[1] = (uint)in[i+1];
4880 (out+j)[2] = (uint)in[i+2];
4881 (out+j)[3] = (uint)in[i+3];
4882 (out+j)[4] = (uint)in[i+4];
4883 (out+j)[5] = (uint)in[i+5];
4895 uint *out = (uint*)_out;
4901 (out+j)[0] = (uint)in[i+0];
4902 (out+j)[1] = (uint)in[i+1];
4903 (out+j)[2] = (uint)in[i+2];
4904 (out+j)[3] = (uint)in[i+3];
4905 (out+j)[4] = (uint)in[i+4];
4906 (out+j)[5] = (uint)in[i+5];
4909 (out+j)[0] = (uint)in[i+2];
4910 (out+j)[1] = (uint)in[i-2];
4911 (out+j)[2] = (uint)in[i+0];
4912 (out+j)[3] = (uint)in[i+3];
4913 (out+j)[4] = (uint)in[i+4];
4914 (out+j)[5] = (uint)in[i+6];
4927 uint *out = (uint*)_out;
4931 (out+j)[0] = (uint)in[i];
4943 uint *out = (uint*)_out;
4947 (out+j)[0] = (uint)in[i+1];
4948 (out+j)[1] = (uint)in[i];
4960 uint *out = (uint*)_out;
4964 (out+j)[0] = (uint)in[i+1];
4965 (out+j)[1] = (uint)in[i];
4977 uint *out = (uint*)_out;
4981 (out+j)[0] = (uint)in[i+1];
4982 (out+j)[1] = (uint)in[i];
4984 (out+j)[0] = (uint)in[start];
4985 (out+j)[1] = (uint)in[i];
4996 uint *out = (uint*)_out;
5000 (out+j)[0] = (uint)in[i+1];
5001 (out+j)[1] = (uint)in[i+2];
5002 (out+j)[2] = (uint)in[i];
5014 uint *out = (uint*)_out;
5018 (out+j)[0] = (uint)in[i+1+(i&1)];
5019 (out+j)[1] = (uint)in[i+2-(i&1)];
5020 (out+j)[2] = (uint)in[i];
5032 uint *out = (uint*)_out;
5036 (out+j)[0] = (uint)in[i+1];
5037 (out+j)[1] = (uint)in[i+2];
5038 (out+j)[2] = (uint)in[start];
5050 uint *out = (uint*)_out;
5054 (out+j+0)[0] = (uint)in[i+1];
5055 (out+j+0)[1] = (uint)in[i+2];
5056 (out+j+0)[2] = (uint)in[i+0];
5057 (out+j+3)[0] = (uint)in[i+2];
5058 (out+j+3)[1] = (uint)in[i+3];
5059 (out+j+3)[2] = (uint)in[i+0];
5071 uint *out = (uint*)_out;
5075 (out+j+0)[0] = (uint)in[i+1];
5076 (out+j+0)[1] = (uint)in[i+3];
5077 (out+j+0)[2] = (uint)in[i+0];
5078 (out+j+3)[0] = (uint)in[i+3];
5079 (out+j+3)[1] = (uint)in[i+2];
5080 (out+j+3)[2] = (uint)in[i+0];
5092 uint *out = (uint*)_out;
5096 (out+j)[0] = (uint)in[i+1];
5097 (out+j)[1] = (uint)in[i+2];
5098 (out+j)[2] = (uint)in[start];
5110 uint *out = (uint*)_out;
5114 (out+j)[0] = (uint)in[i+3];
5115 (out+j)[1] = (uint)in[i+2];
5116 (out+j)[2] = (uint)in[i+1];
5117 (out+j)[3] = (uint)in[i+0];
5129 uint *out = (uint*)_out;
5133 (out+j)[0] = (uint)in[i+3];
5134 (out+j)[1] = (uint)in[i+2];
5135 (out+j)[2] = (uint)in[i+1];
5136 (out+j)[3] = (uint)in[i+0];
5148 uint *out = (uint*)_out;
5152 (out+j)[0] = (uint)in[i+4];
5153 (out+j)[1] = (uint)in[i+5];
5154 (out+j)[2] = (uint)in[i+0];
5155 (out+j)[3] = (uint)in[i+1];
5156 (out+j)[4] = (uint)in[i+2];
5157 (out+j)[5] = (uint)in[i+3];
5169 uint *out = (uint*)_out;
5175 (out+j)[0] = (uint)in[i+4];
5176 (out+j)[1] = (uint)in[i+5];
5177 (out+j)[2] = (uint)in[i+0];
5178 (out+j)[3] = (uint)in[i+1];
5179 (out+j)[4] = (uint)in[i+2];
5180 (out+j)[5] = (uint)in[i+3];
5183 (out+j)[0] = (uint)in[i+4];
5184 (out+j)[1] = (uint)in[i+6];
5185 (out+j)[2] = (uint)in[i+2];
5186 (out+j)[3] = (uint)in[i-2];
5187 (out+j)[4] = (uint)in[i+0];
5188 (out+j)[5] = (uint)in[i+3];
5201 uint *out = (uint*)_out;
5205 (out+j)[0] = (uint)in[i];
5217 uint *out = (uint*)_out;
5221 (out+j)[0] = (uint)in[i+1];
5222 (out+j)[1] = (uint)in[i];
5234 uint *out = (uint*)_out;
5238 (out+j)[0] = (uint)in[i+1];
5239 (out+j)[1] = (uint)in[i];
5251 uint *out = (uint*)_out;
5255 (out+j)[0] = (uint)in[i+1];
5256 (out+j)[1] = (uint)in[i];
5258 (out+j)[0] = (uint)in[start];
5259 (out+j)[1] = (uint)in[i];
5270 uint *out = (uint*)_out;
5274 (out+j)[0] = (uint)in[i+1];
5275 (out+j)[1] = (uint)in[i+2];
5276 (out+j)[2] = (uint)in[i];
5288 uint *out = (uint*)_out;
5292 (out+j)[0] = (uint)in[i+1+(i&1)];
5293 (out+j)[1] = (uint)in[i+2-(i&1)];
5294 (out+j)[2] = (uint)in[i];
5306 uint *out = (uint*)_out;
5310 (out+j)[0] = (uint)in[i+1];
5311 (out+j)[1] = (uint)in[i+2];
5312 (out+j)[2] = (uint)in[start];
5324 uint *out = (uint*)_out;
5330 (out+j+0)[0] = restart_index;
5331 (out+j+0)[1] = restart_index;
5332 (out+j+0)[2] = restart_index;
5333 (out+j+3)[0] = restart_index;
5334 (out+j+3)[1] = restart_index;
5335 (out+j+3)[2] = restart_index;
5354 (out+j+0)[0] = (uint)in[i+1];
5355 (out+j+0)[1] = (uint)in[i+2];
5356 (out+j+0)[2] = (uint)in[i+0];
5357 (out+j+3)[0] = (uint)in[i+2];
5358 (out+j+3)[1] = (uint)in[i+3];
5359 (out+j+3)[2] = (uint)in[i+0];
5371 uint *out = (uint*)_out;
5377 (out+j+0)[0] = restart_index;
5378 (out+j+0)[1] = restart_index;
5379 (out+j+0)[2] = restart_index;
5380 (out+j+3)[0] = restart_index;
5381 (out+j+3)[1] = restart_index;
5382 (out+j+3)[2] = restart_index;
5401 (out+j+0)[0] = (uint)in[i+1];
5402 (out+j+0)[1] = (uint)in[i+3];
5403 (out+j+0)[2] = (uint)in[i+0];
5404 (out+j+3)[0] = (uint)in[i+3];
5405 (out+j+3)[1] = (uint)in[i+2];
5406 (out+j+3)[2] = (uint)in[i+0];
5418 uint *out = (uint*)_out;
5424 (out+j+0)[0] = restart_index;
5425 (out+j+0)[1] = restart_index;
5426 (out+j+0)[2] = restart_index;
5444 (out+j)[0] = (uint)in[i+1];
5445 (out+j)[1] = (uint)in[i+2];
5446 (out+j)[2] = (uint)in[start];
5458 uint *out = (uint*)_out;
5462 (out+j)[0] = (uint)in[i+3];
5463 (out+j)[1] = (uint)in[i+2];
5464 (out+j)[2] = (uint)in[i+1];
5465 (out+j)[3] = (uint)in[i+0];
5477 uint *out = (uint*)_out;
5481 (out+j)[0] = (uint)in[i+3];
5482 (out+j)[1] = (uint)in[i+2];
5483 (out+j)[2] = (uint)in[i+1];
5484 (out+j)[3] = (uint)in[i+0];
5496 uint *out = (uint*)_out;
5500 (out+j)[0] = (uint)in[i+4];
5501 (out+j)[1] = (uint)in[i+5];
5502 (out+j)[2] = (uint)in[i+0];
5503 (out+j)[3] = (uint)in[i+1];
5504 (out+j)[4] = (uint)in[i+2];
5505 (out+j)[5] = (uint)in[i+3];
5517 uint *out = (uint*)_out;
5523 (out+j)[0] = (uint)in[i+4];
5524 (out+j)[1] = (uint)in[i+5];
5525 (out+j)[2] = (uint)in[i+0];
5526 (out+j)[3] = (uint)in[i+1];
5527 (out+j)[4] = (uint)in[i+2];
5528 (out+j)[5] = (uint)in[i+3];
5531 (out+j)[0] = (uint)in[i+4];
5532 (out+j)[1] = (uint)in[i+6];
5533 (out+j)[2] = (uint)in[i+2];
5534 (out+j)[3] = (uint)in[i-2];
5535 (out+j)[4] = (uint)in[i+0];
5536 (out+j)[5] = (uint)in[i+3];
5549 uint *out = (uint*)_out;
5553 (out+j)[0] = (uint)in[i];
5565 uint *out = (uint*)_out;
5569 (out+j)[0] = (uint)in[i+1];
5570 (out+j)[1] = (uint)in[i];
5582 uint *out = (uint*)_out;
5586 (out+j)[0] = (uint)in[i+1];
5587 (out+j)[1] = (uint)in[i];
5599 uint *out = (uint*)_out;
5603 (out+j)[0] = (uint)in[i+1];
5604 (out+j)[1] = (uint)in[i];
5606 (out+j)[0] = (uint)in[start];
5607 (out+j)[1] = (uint)in[i];
5618 uint *out = (uint*)_out;
5622 (out+j)[0] = (uint)in[i+2];
5623 (out+j)[1] = (uint)in[i];
5624 (out+j)[2] = (uint)in[i+1];
5636 uint *out = (uint*)_out;
5640 (out+j)[0] = (uint)in[i+2];
5641 (out+j)[1] = (uint)in[i+(i&1)];
5642 (out+j)[2] = (uint)in[i+1-(i&1)];
5654 uint *out = (uint*)_out;
5658 (out+j)[0] = (uint)in[i+2];
5659 (out+j)[1] = (uint)in[start];
5660 (out+j)[2] = (uint)in[i+1];
5672 uint *out = (uint*)_out;
5676 (out+j+0)[0] = (uint)in[i+3];
5677 (out+j+0)[1] = (uint)in[i+0];
5678 (out+j+0)[2] = (uint)in[i+1];
5679 (out+j+3)[0] = (uint)in[i+3];
5680 (out+j+3)[1] = (uint)in[i+1];
5681 (out+j+3)[2] = (uint)in[i+2];
5693 uint *out = (uint*)_out;
5697 (out+j+0)[0] = (uint)in[i+3];
5698 (out+j+0)[1] = (uint)in[i+2];
5699 (out+j+0)[2] = (uint)in[i+0];
5700 (out+j+3)[0] = (uint)in[i+3];
5701 (out+j+3)[1] = (uint)in[i+0];
5702 (out+j+3)[2] = (uint)in[i+1];
5714 uint *out = (uint*)_out;
5718 (out+j)[0] = (uint)in[start];
5719 (out+j)[1] = (uint)in[i+1];
5720 (out+j)[2] = (uint)in[i+2];
5732 uint *out = (uint*)_out;
5736 (out+j)[0] = (uint)in[i+3];
5737 (out+j)[1] = (uint)in[i+2];
5738 (out+j)[2] = (uint)in[i+1];
5739 (out+j)[3] = (uint)in[i+0];
5751 uint *out = (uint*)_out;
5755 (out+j)[0] = (uint)in[i+3];
5756 (out+j)[1] = (uint)in[i+2];
5757 (out+j)[2] = (uint)in[i+1];
5758 (out+j)[3] = (uint)in[i+0];
5770 uint *out = (uint*)_out;
5774 (out+j)[0] = (uint)in[i+4];
5775 (out+j)[1] = (uint)in[i+5];
5776 (out+j)[2] = (uint)in[i+0];
5777 (out+j)[3] = (uint)in[i+1];
5778 (out+j)[4] = (uint)in[i+2];
5779 (out+j)[5] = (uint)in[i+3];
5791 uint *out = (uint*)_out;
5797 (out+j)[0] = (uint)in[i+4];
5798 (out+j)[1] = (uint)in[i+5];
5799 (out+j)[2] = (uint)in[i+0];
5800 (out+j)[3] = (uint)in[i+1];
5801 (out+j)[4] = (uint)in[i+2];
5802 (out+j)[5] = (uint)in[i+3];
5805 (out+j)[0] = (uint)in[i+4];
5806 (out+j)[1] = (uint)in[i+6];
5807 (out+j)[2] = (uint)in[i+2];
5808 (out+j)[3] = (uint)in[i-2];
5809 (out+j)[4] = (uint)in[i+0];
5810 (out+j)[5] = (uint)in[i+3];
5823 uint *out = (uint*)_out;
5827 (out+j)[0] = (uint)in[i];
5839 uint *out = (uint*)_out;
5843 (out+j)[0] = (uint)in[i+1];
5844 (out+j)[1] = (uint)in[i];
5856 uint *out = (uint*)_out;
5860 (out+j)[0] = (uint)in[i+1];
5861 (out+j)[1] = (uint)in[i];
5873 uint *out = (uint*)_out;
5877 (out+j)[0] = (uint)in[i+1];
5878 (out+j)[1] = (uint)in[i];
5880 (out+j)[0] = (uint)in[start];
5881 (out+j)[1] = (uint)in[i];
5892 uint *out = (uint*)_out;
5896 (out+j)[0] = (uint)in[i+2];
5897 (out+j)[1] = (uint)in[i];
5898 (out+j)[2] = (uint)in[i+1];
5910 uint *out = (uint*)_out;
5914 (out+j)[0] = (uint)in[i+2];
5915 (out+j)[1] = (uint)in[i+(i&1)];
5916 (out+j)[2] = (uint)in[i+1-(i&1)];
5928 uint *out = (uint*)_out;
5932 (out+j)[0] = (uint)in[i+2];
5933 (out+j)[1] = (uint)in[start];
5934 (out+j)[2] = (uint)in[i+1];
5946 uint *out = (uint*)_out;
5952 (out+j+0)[0] = restart_index;
5953 (out+j+0)[1] = restart_index;
5954 (out+j+0)[2] = restart_index;
5955 (out+j+3)[0] = restart_index;
5956 (out+j+3)[1] = restart_index;
5957 (out+j+3)[2] = restart_index;
5976 (out+j+0)[0] = (uint)in[i+3];
5977 (out+j+0)[1] = (uint)in[i+0];
5978 (out+j+0)[2] = (uint)in[i+1];
5979 (out+j+3)[0] = (uint)in[i+3];
5980 (out+j+3)[1] = (uint)in[i+1];
5981 (out+j+3)[2] = (uint)in[i+2];
5993 uint *out = (uint*)_out;
5999 (out+j+0)[0] = restart_index;
6000 (out+j+0)[1] = restart_index;
6001 (out+j+0)[2] = restart_index;
6002 (out+j+3)[0] = restart_index;
6003 (out+j+3)[1] = restart_index;
6004 (out+j+3)[2] = restart_index;
6023 (out+j+0)[0] = (uint)in[i+3];
6024 (out+j+0)[1] = (uint)in[i+2];
6025 (out+j+0)[2] = (uint)in[i+0];
6026 (out+j+3)[0] = (uint)in[i+3];
6027 (out+j+3)[1] = (uint)in[i+0];
6028 (out+j+3)[2] = (uint)in[i+1];
6040 uint *out = (uint*)_out;
6046 (out+j+0)[0] = restart_index;
6047 (out+j+0)[1] = restart_index;
6048 (out+j+0)[2] = restart_index;
6066 (out+j)[0] = (uint)in[start];
6067 (out+j)[1] = (uint)in[i+1];
6068 (out+j)[2] = (uint)in[i+2];
6080 uint *out = (uint*)_out;
6084 (out+j)[0] = (uint)in[i+3];
6085 (out+j)[1] = (uint)in[i+2];
6086 (out+j)[2] = (uint)in[i+1];
6087 (out+j)[3] = (uint)in[i+0];
6099 uint *out = (uint*)_out;
6103 (out+j)[0] = (uint)in[i+3];
6104 (out+j)[1] = (uint)in[i+2];
6105 (out+j)[2] = (uint)in[i+1];
6106 (out+j)[3] = (uint)in[i+0];
6118 uint *out = (uint*)_out;
6122 (out+j)[0] = (uint)in[i+4];
6123 (out+j)[1] = (uint)in[i+5];
6124 (out+j)[2] = (uint)in[i+0];
6125 (out+j)[3] = (uint)in[i+1];
6126 (out+j)[4] = (uint)in[i+2];
6127 (out+j)[5] = (uint)in[i+3];
6139 uint *out = (uint*)_out;
6145 (out+j)[0] = (uint)in[i+4];
6146 (out+j)[1] = (uint)in[i+5];
6147 (out+j)[2] = (uint)in[i+0];
6148 (out+j)[3] = (uint)in[i+1];
6149 (out+j)[4] = (uint)in[i+2];
6150 (out+j)[5] = (uint)in[i+3];
6153 (out+j)[0] = (uint)in[i+4];
6154 (out+j)[1] = (uint)in[i+6];
6155 (out+j)[2] = (uint)in[i+2];
6156 (out+j)[3] = (uint)in[i-2];
6157 (out+j)[4] = (uint)in[i+0];
6158 (out+j)[5] = (uint)in[i+3];
6171 uint *out = (uint*)_out;
6175 (out+j)[0] = (uint)in[i];
6187 uint *out = (uint*)_out;
6191 (out+j)[0] = (uint)in[i];
6192 (out+j)[1] = (uint)in[i+1];
6204 uint *out = (uint*)_out;
6208 (out+j)[0] = (uint)in[i];
6209 (out+j)[1] = (uint)in[i+1];
6221 uint *out = (uint*)_out;
6225 (out+j)[0] = (uint)in[i];
6226 (out+j)[1] = (uint)in[i+1];
6228 (out+j)[0] = (uint)in[i];
6229 (out+j)[1] = (uint)in[start];
6240 uint *out = (uint*)_out;
6244 (out+j)[0] = (uint)in[i];
6245 (out+j)[1] = (uint)in[i+1];
6246 (out+j)[2] = (uint)in[i+2];
6258 uint *out = (uint*)_out;
6262 (out+j)[0] = (uint)in[i+(i&1)];
6263 (out+j)[1] = (uint)in[i+1-(i&1)];
6264 (out+j)[2] = (uint)in[i+2];
6276 uint *out = (uint*)_out;
6280 (out+j)[0] = (uint)in[start];
6281 (out+j)[1] = (uint)in[i+1];
6282 (out+j)[2] = (uint)in[i+2];
6294 uint *out = (uint*)_out;
6298 (out+j+0)[0] = (uint)in[i+0];
6299 (out+j+0)[1] = (uint)in[i+1];
6300 (out+j+0)[2] = (uint)in[i+3];
6301 (out+j+3)[0] = (uint)in[i+1];
6302 (out+j+3)[1] = (uint)in[i+2];
6303 (out+j+3)[2] = (uint)in[i+3];
6315 uint *out = (uint*)_out;
6319 (out+j+0)[0] = (uint)in[i+2];
6320 (out+j+0)[1] = (uint)in[i+0];
6321 (out+j+0)[2] = (uint)in[i+3];
6322 (out+j+3)[0] = (uint)in[i+0];
6323 (out+j+3)[1] = (uint)in[i+1];
6324 (out+j+3)[2] = (uint)in[i+3];
6336 uint *out = (uint*)_out;
6340 (out+j)[0] = (uint)in[i+1];
6341 (out+j)[1] = (uint)in[i+2];
6342 (out+j)[2] = (uint)in[start];
6354 uint *out = (uint*)_out;
6358 (out+j)[0] = (uint)in[i+0];
6359 (out+j)[1] = (uint)in[i+1];
6360 (out+j)[2] = (uint)in[i+2];
6361 (out+j)[3] = (uint)in[i+3];
6373 uint *out = (uint*)_out;
6377 (out+j)[0] = (uint)in[i+0];
6378 (out+j)[1] = (uint)in[i+1];
6379 (out+j)[2] = (uint)in[i+2];
6380 (out+j)[3] = (uint)in[i+3];
6392 uint *out = (uint*)_out;
6396 (out+j)[0] = (uint)in[i+0];
6397 (out+j)[1] = (uint)in[i+1];
6398 (out+j)[2] = (uint)in[i+2];
6399 (out+j)[3] = (uint)in[i+3];
6400 (out+j)[4] = (uint)in[i+4];
6401 (out+j)[5] = (uint)in[i+5];
6413 uint *out = (uint*)_out;
6419 (out+j)[0] = (uint)in[i+0];
6420 (out+j)[1] = (uint)in[i+1];
6421 (out+j)[2] = (uint)in[i+2];
6422 (out+j)[3] = (uint)in[i+3];
6423 (out+j)[4] = (uint)in[i+4];
6424 (out+j)[5] = (uint)in[i+5];
6427 (out+j)[0] = (uint)in[i+2];
6428 (out+j)[1] = (uint)in[i-2];
6429 (out+j)[2] = (uint)in[i+0];
6430 (out+j)[3] = (uint)in[i+3];
6431 (out+j)[4] = (uint)in[i+4];
6432 (out+j)[5] = (uint)in[i+6];
6445 uint *out = (uint*)_out;
6449 (out+j)[0] = (uint)in[i];
6461 uint *out = (uint*)_out;
6465 (out+j)[0] = (uint)in[i];
6466 (out+j)[1] = (uint)in[i+1];
6478 uint *out = (uint*)_out;
6482 (out+j)[0] = (uint)in[i];
6483 (out+j)[1] = (uint)in[i+1];
6495 uint *out = (uint*)_out;
6499 (out+j)[0] = (uint)in[i];
6500 (out+j)[1] = (uint)in[i+1];
6502 (out+j)[0] = (uint)in[i];
6503 (out+j)[1] = (uint)in[start];
6514 uint *out = (uint*)_out;
6518 (out+j)[0] = (uint)in[i];
6519 (out+j)[1] = (uint)in[i+1];
6520 (out+j)[2] = (uint)in[i+2];
6532 uint *out = (uint*)_out;
6536 (out+j)[0] = (uint)in[i+(i&1)];
6537 (out+j)[1] = (uint)in[i+1-(i&1)];
6538 (out+j)[2] = (uint)in[i+2];
6550 uint *out = (uint*)_out;
6554 (out+j)[0] = (uint)in[start];
6555 (out+j)[1] = (uint)in[i+1];
6556 (out+j)[2] = (uint)in[i+2];
6568 uint *out = (uint*)_out;
6574 (out+j+0)[0] = restart_index;
6575 (out+j+0)[1] = restart_index;
6576 (out+j+0)[2] = restart_index;
6577 (out+j+3)[0] = restart_index;
6578 (out+j+3)[1] = restart_index;
6579 (out+j+3)[2] = restart_index;
6598 (out+j+0)[0] = (uint)in[i+0];
6599 (out+j+0)[1] = (uint)in[i+1];
6600 (out+j+0)[2] = (uint)in[i+3];
6601 (out+j+3)[0] = (uint)in[i+1];
6602 (out+j+3)[1] = (uint)in[i+2];
6603 (out+j+3)[2] = (uint)in[i+3];
6615 uint *out = (uint*)_out;
6621 (out+j+0)[0] = restart_index;
6622 (out+j+0)[1] = restart_index;
6623 (out+j+0)[2] = restart_index;
6624 (out+j+3)[0] = restart_index;
6625 (out+j+3)[1] = restart_index;
6626 (out+j+3)[2] = restart_index;
6645 (out+j+0)[0] = (uint)in[i+2];
6646 (out+j+0)[1] = (uint)in[i+0];
6647 (out+j+0)[2] = (uint)in[i+3];
6648 (out+j+3)[0] = (uint)in[i+0];
6649 (out+j+3)[1] = (uint)in[i+1];
6650 (out+j+3)[2] = (uint)in[i+3];
6662 uint *out = (uint*)_out;
6668 (out+j+0)[0] = restart_index;
6669 (out+j+0)[1] = restart_index;
6670 (out+j+0)[2] = restart_index;
6688 (out+j)[0] = (uint)in[i+1];
6689 (out+j)[1] = (uint)in[i+2];
6690 (out+j)[2] = (uint)in[start];
6702 uint *out = (uint*)_out;
6706 (out+j)[0] = (uint)in[i+0];
6707 (out+j)[1] = (uint)in[i+1];
6708 (out+j)[2] = (uint)in[i+2];
6709 (out+j)[3] = (uint)in[i+3];
6721 uint *out = (uint*)_out;
6725 (out+j)[0] = (uint)in[i+0];
6726 (out+j)[1] = (uint)in[i+1];
6727 (out+j)[2] = (uint)in[i+2];
6728 (out+j)[3] = (uint)in[i+3];
6740 uint *out = (uint*)_out;
6744 (out+j)[0] = (uint)in[i+0];
6745 (out+j)[1] = (uint)in[i+1];
6746 (out+j)[2] = (uint)in[i+2];
6747 (out+j)[3] = (uint)in[i+3];
6748 (out+j)[4] = (uint)in[i+4];
6749 (out+j)[5] = (uint)in[i+5];
6761 uint *out = (uint*)_out;
6767 (out+j)[0] = (uint)in[i+0];
6768 (out+j)[1] = (uint)in[i+1];
6769 (out+j)[2] = (uint)in[i+2];
6770 (out+j)[3] = (uint)in[i+3];
6771 (out+j)[4] = (uint)in[i+4];
6772 (out+j)[5] = (uint)in[i+5];
6775 (out+j)[0] = (uint)in[i+2];
6776 (out+j)[1] = (uint)in[i-2];
6777 (out+j)[2] = (uint)in[i+0];
6778 (out+j)[3] = (uint)in[i+3];
6779 (out+j)[4] = (uint)in[i+4];
6780 (out+j)[5] = (uint)in[i+6];
6793 ushort *out = (ushort*)_out;
6797 (out+j)[0] = (ushort)in[i];
6809 ushort *out = (ushort*)_out;
6813 (out+j)[0] = (ushort)in[i];
6814 (out+j)[1] = (ushort)in[i+1];
6826 ushort *out = (ushort*)_out;
6830 (out+j)[0] = (ushort)in[i];
6831 (out+j)[1] = (ushort)in[i+1];
6843 ushort *out = (ushort*)_out;
6847 (out+j)[0] = (ushort)in[i];
6848 (out+j)[1] = (ushort)in[i+1];
6850 (out+j)[0] = (ushort)in[i];
6851 (out+j)[1] = (ushort)in[start];
6862 ushort *out = (ushort*)_out;
6866 (out+j)[0] = (ushort)in[i];
6867 (out+j)[1] = (ushort)in[i+1];
6868 (out+j)[2] = (ushort)in[i+2];
6880 ushort *out = (ushort*)_out;
6884 (out+j)[0] = (ushort)in[i];
6885 (out+j)[1] = (ushort)in[i+1+(i&1)];
6886 (out+j)[2] = (ushort)in[i+2-(i&1)];
6898 ushort *out = (ushort*)_out;
6902 (out+j)[0] = (ushort)in[start];
6903 (out+j)[1] = (ushort)in[i+1];
6904 (out+j)[2] = (ushort)in[i+2];
6916 ushort *out = (ushort*)_out;
6920 (out+j+0)[0] = (ushort)in[i+0];
6921 (out+j+0)[1] = (ushort)in[i+1];
6922 (out+j+0)[2] = (ushort)in[i+2];
6923 (out+j+3)[0] = (ushort)in[i+0];
6924 (out+j+3)[1] = (ushort)in[i+2];
6925 (out+j+3)[2] = (ushort)in[i+3];
6937 ushort *out = (ushort*)_out;
6941 (out+j+0)[0] = (ushort)in[i+0];
6942 (out+j+0)[1] = (ushort)in[i+1];
6943 (out+j+0)[2] = (ushort)in[i+3];
6944 (out+j+3)[0] = (ushort)in[i+0];
6945 (out+j+3)[1] = (ushort)in[i+3];
6946 (out+j+3)[2] = (ushort)in[i+2];
6958 ushort *out = (ushort*)_out;
6962 (out+j)[0] = (ushort)in[start];
6963 (out+j)[1] = (ushort)in[i+1];
6964 (out+j)[2] = (ushort)in[i+2];
6976 ushort *out = (ushort*)_out;
6980 (out+j)[0] = (ushort)in[i+0];
6981 (out+j)[1] = (ushort)in[i+1];
6982 (out+j)[2] = (ushort)in[i+2];
6983 (out+j)[3] = (ushort)in[i+3];
6995 ushort *out = (ushort*)_out;
6999 (out+j)[0] = (ushort)in[i+0];
7000 (out+j)[1] = (ushort)in[i+1];
7001 (out+j)[2] = (ushort)in[i+2];
7002 (out+j)[3] = (ushort)in[i+3];
7014 ushort *out = (ushort*)_out;
7018 (out+j)[0] = (ushort)in[i+0];
7019 (out+j)[1] = (ushort)in[i+1];
7020 (out+j)[2] = (ushort)in[i+2];
7021 (out+j)[3] = (ushort)in[i+3];
7022 (out+j)[4] = (ushort)in[i+4];
7023 (out+j)[5] = (ushort)in[i+5];
7035 ushort *out = (ushort*)_out;
7041 (out+j)[0] = (ushort)in[i+0];
7042 (out+j)[1] = (ushort)in[i+1];
7043 (out+j)[2] = (ushort)in[i+2];
7044 (out+j)[3] = (ushort)in[i+3];
7045 (out+j)[4] = (ushort)in[i+4];
7046 (out+j)[5] = (ushort)in[i+5];
7049 (out+j)[0] = (ushort)in[i+2];
7050 (out+j)[1] = (ushort)in[i-2];
7051 (out+j)[2] = (ushort)in[i+0];
7052 (out+j)[3] = (ushort)in[i+3];
7053 (out+j)[4] = (ushort)in[i+4];
7054 (out+j)[5] = (ushort)in[i+6];
7067 ushort *out = (ushort*)_out;
7071 (out+j)[0] = (ushort)in[i];
7083 ushort *out = (ushort*)_out;
7087 (out+j)[0] = (ushort)in[i];
7088 (out+j)[1] = (ushort)in[i+1];
7100 ushort *out = (ushort*)_out;
7104 (out+j)[0] = (ushort)in[i];
7105 (out+j)[1] = (ushort)in[i+1];
7117 ushort *out = (ushort*)_out;
7121 (out+j)[0] = (ushort)in[i];
7122 (out+j)[1] = (ushort)in[i+1];
7124 (out+j)[0] = (ushort)in[i];
7125 (out+j)[1] = (ushort)in[start];
7136 ushort *out = (ushort*)_out;
7140 (out+j)[0] = (ushort)in[i];
7141 (out+j)[1] = (ushort)in[i+1];
7142 (out+j)[2] = (ushort)in[i+2];
7154 ushort *out = (ushort*)_out;
7158 (out+j)[0] = (ushort)in[i];
7159 (out+j)[1] = (ushort)in[i+1+(i&1)];
7160 (out+j)[2] = (ushort)in[i+2-(i&1)];
7172 ushort *out = (ushort*)_out;
7176 (out+j)[0] = (ushort)in[start];
7177 (out+j)[1] = (ushort)in[i+1];
7178 (out+j)[2] = (ushort)in[i+2];
7190 ushort *out = (ushort*)_out;
7196 (out+j+0)[0] = restart_index;
7197 (out+j+0)[1] = restart_index;
7198 (out+j+0)[2] = restart_index;
7199 (out+j+3)[0] = restart_index;
7200 (out+j+3)[1] = restart_index;
7201 (out+j+3)[2] = restart_index;
7220 (out+j+0)[0] = (ushort)in[i+0];
7221 (out+j+0)[1] = (ushort)in[i+1];
7222 (out+j+0)[2] = (ushort)in[i+2];
7223 (out+j+3)[0] = (ushort)in[i+0];
7224 (out+j+3)[1] = (ushort)in[i+2];
7225 (out+j+3)[2] = (ushort)in[i+3];
7237 ushort *out = (ushort*)_out;
7243 (out+j+0)[0] = restart_index;
7244 (out+j+0)[1] = restart_index;
7245 (out+j+0)[2] = restart_index;
7246 (out+j+3)[0] = restart_index;
7247 (out+j+3)[1] = restart_index;
7248 (out+j+3)[2] = restart_index;
7267 (out+j+0)[0] = (ushort)in[i+0];
7268 (out+j+0)[1] = (ushort)in[i+1];
7269 (out+j+0)[2] = (ushort)in[i+3];
7270 (out+j+3)[0] = (ushort)in[i+0];
7271 (out+j+3)[1] = (ushort)in[i+3];
7272 (out+j+3)[2] = (ushort)in[i+2];
7284 ushort *out = (ushort*)_out;
7290 (out+j+0)[0] = restart_index;
7291 (out+j+0)[1] = restart_index;
7292 (out+j+0)[2] = restart_index;
7310 (out+j)[0] = (ushort)in[start];
7311 (out+j)[1] = (ushort)in[i+1];
7312 (out+j)[2] = (ushort)in[i+2];
7324 ushort *out = (ushort*)_out;
7328 (out+j)[0] = (ushort)in[i+0];
7329 (out+j)[1] = (ushort)in[i+1];
7330 (out+j)[2] = (ushort)in[i+2];
7331 (out+j)[3] = (ushort)in[i+3];
7343 ushort *out = (ushort*)_out;
7347 (out+j)[0] = (ushort)in[i+0];
7348 (out+j)[1] = (ushort)in[i+1];
7349 (out+j)[2] = (ushort)in[i+2];
7350 (out+j)[3] = (ushort)in[i+3];
7362 ushort *out = (ushort*)_out;
7366 (out+j)[0] = (ushort)in[i+0];
7367 (out+j)[1] = (ushort)in[i+1];
7368 (out+j)[2] = (ushort)in[i+2];
7369 (out+j)[3] = (ushort)in[i+3];
7370 (out+j)[4] = (ushort)in[i+4];
7371 (out+j)[5] = (ushort)in[i+5];
7383 ushort *out = (ushort*)_out;
7389 (out+j)[0] = (ushort)in[i+0];
7390 (out+j)[1] = (ushort)in[i+1];
7391 (out+j)[2] = (ushort)in[i+2];
7392 (out+j)[3] = (ushort)in[i+3];
7393 (out+j)[4] = (ushort)in[i+4];
7394 (out+j)[5] = (ushort)in[i+5];
7397 (out+j)[0] = (ushort)in[i+2];
7398 (out+j)[1] = (ushort)in[i-2];
7399 (out+j)[2] = (ushort)in[i+0];
7400 (out+j)[3] = (ushort)in[i+3];
7401 (out+j)[4] = (ushort)in[i+4];
7402 (out+j)[5] = (ushort)in[i+6];
7415 ushort *out = (ushort*)_out;
7419 (out+j)[0] = (ushort)in[i];
7431 ushort *out = (ushort*)_out;
7435 (out+j)[0] = (ushort)in[i+1];
7436 (out+j)[1] = (ushort)in[i];
7448 ushort *out = (ushort*)_out;
7452 (out+j)[0] = (ushort)in[i+1];
7453 (out+j)[1] = (ushort)in[i];
7465 ushort *out = (ushort*)_out;
7469 (out+j)[0] = (ushort)in[i+1];
7470 (out+j)[1] = (ushort)in[i];
7472 (out+j)[0] = (ushort)in[start];
7473 (out+j)[1] = (ushort)in[i];
7484 ushort *out = (ushort*)_out;
7488 (out+j)[0] = (ushort)in[i+1];
7489 (out+j)[1] = (ushort)in[i+2];
7490 (out+j)[2] = (ushort)in[i];
7502 ushort *out = (ushort*)_out;
7506 (out+j)[0] = (ushort)in[i+1+(i&1)];
7507 (out+j)[1] = (ushort)in[i+2-(i&1)];
7508 (out+j)[2] = (ushort)in[i];
7520 ushort *out = (ushort*)_out;
7524 (out+j)[0] = (ushort)in[i+1];
7525 (out+j)[1] = (ushort)in[i+2];
7526 (out+j)[2] = (ushort)in[start];
7538 ushort *out = (ushort*)_out;
7542 (out+j+0)[0] = (ushort)in[i+1];
7543 (out+j+0)[1] = (ushort)in[i+2];
7544 (out+j+0)[2] = (ushort)in[i+0];
7545 (out+j+3)[0] = (ushort)in[i+2];
7546 (out+j+3)[1] = (ushort)in[i+3];
7547 (out+j+3)[2] = (ushort)in[i+0];
7559 ushort *out = (ushort*)_out;
7563 (out+j+0)[0] = (ushort)in[i+1];
7564 (out+j+0)[1] = (ushort)in[i+3];
7565 (out+j+0)[2] = (ushort)in[i+0];
7566 (out+j+3)[0] = (ushort)in[i+3];
7567 (out+j+3)[1] = (ushort)in[i+2];
7568 (out+j+3)[2] = (ushort)in[i+0];
7580 ushort *out = (ushort*)_out;
7584 (out+j)[0] = (ushort)in[i+1];
7585 (out+j)[1] = (ushort)in[i+2];
7586 (out+j)[2] = (ushort)in[start];
7598 ushort *out = (ushort*)_out;
7602 (out+j)[0] = (ushort)in[i+3];
7603 (out+j)[1] = (ushort)in[i+2];
7604 (out+j)[2] = (ushort)in[i+1];
7605 (out+j)[3] = (ushort)in[i+0];
7617 ushort *out = (ushort*)_out;
7621 (out+j)[0] = (ushort)in[i+3];
7622 (out+j)[1] = (ushort)in[i+2];
7623 (out+j)[2] = (ushort)in[i+1];
7624 (out+j)[3] = (ushort)in[i+0];
7636 ushort *out = (ushort*)_out;
7640 (out+j)[0] = (ushort)in[i+4];
7641 (out+j)[1] = (ushort)in[i+5];
7642 (out+j)[2] = (ushort)in[i+0];
7643 (out+j)[3] = (ushort)in[i+1];
7644 (out+j)[4] = (ushort)in[i+2];
7645 (out+j)[5] = (ushort)in[i+3];
7657 ushort *out = (ushort*)_out;
7663 (out+j)[0] = (ushort)in[i+4];
7664 (out+j)[1] = (ushort)in[i+5];
7665 (out+j)[2] = (ushort)in[i+0];
7666 (out+j)[3] = (ushort)in[i+1];
7667 (out+j)[4] = (ushort)in[i+2];
7668 (out+j)[5] = (ushort)in[i+3];
7671 (out+j)[0] = (ushort)in[i+4];
7672 (out+j)[1] = (ushort)in[i+6];
7673 (out+j)[2] = (ushort)in[i+2];
7674 (out+j)[3] = (ushort)in[i-2];
7675 (out+j)[4] = (ushort)in[i+0];
7676 (out+j)[5] = (ushort)in[i+3];
7689 ushort *out = (ushort*)_out;
7693 (out+j)[0] = (ushort)in[i];
7705 ushort *out = (ushort*)_out;
7709 (out+j)[0] = (ushort)in[i+1];
7710 (out+j)[1] = (ushort)in[i];
7722 ushort *out = (ushort*)_out;
7726 (out+j)[0] = (ushort)in[i+1];
7727 (out+j)[1] = (ushort)in[i];
7739 ushort *out = (ushort*)_out;
7743 (out+j)[0] = (ushort)in[i+1];
7744 (out+j)[1] = (ushort)in[i];
7746 (out+j)[0] = (ushort)in[start];
7747 (out+j)[1] = (ushort)in[i];
7758 ushort *out = (ushort*)_out;
7762 (out+j)[0] = (ushort)in[i+1];
7763 (out+j)[1] = (ushort)in[i+2];
7764 (out+j)[2] = (ushort)in[i];
7776 ushort *out = (ushort*)_out;
7780 (out+j)[0] = (ushort)in[i+1+(i&1)];
7781 (out+j)[1] = (ushort)in[i+2-(i&1)];
7782 (out+j)[2] = (ushort)in[i];
7794 ushort *out = (ushort*)_out;
7798 (out+j)[0] = (ushort)in[i+1];
7799 (out+j)[1] = (ushort)in[i+2];
7800 (out+j)[2] = (ushort)in[start];
7812 ushort *out = (ushort*)_out;
7818 (out+j+0)[0] = restart_index;
7819 (out+j+0)[1] = restart_index;
7820 (out+j+0)[2] = restart_index;
7821 (out+j+3)[0] = restart_index;
7822 (out+j+3)[1] = restart_index;
7823 (out+j+3)[2] = restart_index;
7842 (out+j+0)[0] = (ushort)in[i+1];
7843 (out+j+0)[1] = (ushort)in[i+2];
7844 (out+j+0)[2] = (ushort)in[i+0];
7845 (out+j+3)[0] = (ushort)in[i+2];
7846 (out+j+3)[1] = (ushort)in[i+3];
7847 (out+j+3)[2] = (ushort)in[i+0];
7859 ushort *out = (ushort*)_out;
7865 (out+j+0)[0] = restart_index;
7866 (out+j+0)[1] = restart_index;
7867 (out+j+0)[2] = restart_index;
7868 (out+j+3)[0] = restart_index;
7869 (out+j+3)[1] = restart_index;
7870 (out+j+3)[2] = restart_index;
7889 (out+j+0)[0] = (ushort)in[i+1];
7890 (out+j+0)[1] = (ushort)in[i+3];
7891 (out+j+0)[2] = (ushort)in[i+0];
7892 (out+j+3)[0] = (ushort)in[i+3];
7893 (out+j+3)[1] = (ushort)in[i+2];
7894 (out+j+3)[2] = (ushort)in[i+0];
7906 ushort *out = (ushort*)_out;
7912 (out+j+0)[0] = restart_index;
7913 (out+j+0)[1] = restart_index;
7914 (out+j+0)[2] = restart_index;
7932 (out+j)[0] = (ushort)in[i+1];
7933 (out+j)[1] = (ushort)in[i+2];
7934 (out+j)[2] = (ushort)in[start];
7946 ushort *out = (ushort*)_out;
7950 (out+j)[0] = (ushort)in[i+3];
7951 (out+j)[1] = (ushort)in[i+2];
7952 (out+j)[2] = (ushort)in[i+1];
7953 (out+j)[3] = (ushort)in[i+0];
7965 ushort *out = (ushort*)_out;
7969 (out+j)[0] = (ushort)in[i+3];
7970 (out+j)[1] = (ushort)in[i+2];
7971 (out+j)[2] = (ushort)in[i+1];
7972 (out+j)[3] = (ushort)in[i+0];
7984 ushort *out = (ushort*)_out;
7988 (out+j)[0] = (ushort)in[i+4];
7989 (out+j)[1] = (ushort)in[i+5];
7990 (out+j)[2] = (ushort)in[i+0];
7991 (out+j)[3] = (ushort)in[i+1];
7992 (out+j)[4] = (ushort)in[i+2];
7993 (out+j)[5] = (ushort)in[i+3];
8005 ushort *out = (ushort*)_out;
8011 (out+j)[0] = (ushort)in[i+4];
8012 (out+j)[1] = (ushort)in[i+5];
8013 (out+j)[2] = (ushort)in[i+0];
8014 (out+j)[3] = (ushort)in[i+1];
8015 (out+j)[4] = (ushort)in[i+2];
8016 (out+j)[5] = (ushort)in[i+3];
8019 (out+j)[0] = (ushort)in[i+4];
8020 (out+j)[1] = (ushort)in[i+6];
8021 (out+j)[2] = (ushort)in[i+2];
8022 (out+j)[3] = (ushort)in[i-2];
8023 (out+j)[4] = (ushort)in[i+0];
8024 (out+j)[5] = (ushort)in[i+3];
8037 ushort *out = (ushort*)_out;
8041 (out+j)[0] = (ushort)in[i];
8053 ushort *out = (ushort*)_out;
8057 (out+j)[0] = (ushort)in[i+1];
8058 (out+j)[1] = (ushort)in[i];
8070 ushort *out = (ushort*)_out;
8074 (out+j)[0] = (ushort)in[i+1];
8075 (out+j)[1] = (ushort)in[i];
8087 ushort *out = (ushort*)_out;
8091 (out+j)[0] = (ushort)in[i+1];
8092 (out+j)[1] = (ushort)in[i];
8094 (out+j)[0] = (ushort)in[start];
8095 (out+j)[1] = (ushort)in[i];
8106 ushort *out = (ushort*)_out;
8110 (out+j)[0] = (ushort)in[i+2];
8111 (out+j)[1] = (ushort)in[i];
8112 (out+j)[2] = (ushort)in[i+1];
8124 ushort *out = (ushort*)_out;
8128 (out+j)[0] = (ushort)in[i+2];
8129 (out+j)[1] = (ushort)in[i+(i&1)];
8130 (out+j)[2] = (ushort)in[i+1-(i&1)];
8142 ushort *out = (ushort*)_out;
8146 (out+j)[0] = (ushort)in[i+2];
8147 (out+j)[1] = (ushort)in[start];
8148 (out+j)[2] = (ushort)in[i+1];
8160 ushort *out = (ushort*)_out;
8164 (out+j+0)[0] = (ushort)in[i+3];
8165 (out+j+0)[1] = (ushort)in[i+0];
8166 (out+j+0)[2] = (ushort)in[i+1];
8167 (out+j+3)[0] = (ushort)in[i+3];
8168 (out+j+3)[1] = (ushort)in[i+1];
8169 (out+j+3)[2] = (ushort)in[i+2];
8181 ushort *out = (ushort*)_out;
8185 (out+j+0)[0] = (ushort)in[i+3];
8186 (out+j+0)[1] = (ushort)in[i+2];
8187 (out+j+0)[2] = (ushort)in[i+0];
8188 (out+j+3)[0] = (ushort)in[i+3];
8189 (out+j+3)[1] = (ushort)in[i+0];
8190 (out+j+3)[2] = (ushort)in[i+1];
8202 ushort *out = (ushort*)_out;
8206 (out+j)[0] = (ushort)in[start];
8207 (out+j)[1] = (ushort)in[i+1];
8208 (out+j)[2] = (ushort)in[i+2];
8220 ushort *out = (ushort*)_out;
8224 (out+j)[0] = (ushort)in[i+3];
8225 (out+j)[1] = (ushort)in[i+2];
8226 (out+j)[2] = (ushort)in[i+1];
8227 (out+j)[3] = (ushort)in[i+0];
8239 ushort *out = (ushort*)_out;
8243 (out+j)[0] = (ushort)in[i+3];
8244 (out+j)[1] = (ushort)in[i+2];
8245 (out+j)[2] = (ushort)in[i+1];
8246 (out+j)[3] = (ushort)in[i+0];
8258 ushort *out = (ushort*)_out;
8262 (out+j)[0] = (ushort)in[i+4];
8263 (out+j)[1] = (ushort)in[i+5];
8264 (out+j)[2] = (ushort)in[i+0];
8265 (out+j)[3] = (ushort)in[i+1];
8266 (out+j)[4] = (ushort)in[i+2];
8267 (out+j)[5] = (ushort)in[i+3];
8279 ushort *out = (ushort*)_out;
8285 (out+j)[0] = (ushort)in[i+4];
8286 (out+j)[1] = (ushort)in[i+5];
8287 (out+j)[2] = (ushort)in[i+0];
8288 (out+j)[3] = (ushort)in[i+1];
8289 (out+j)[4] = (ushort)in[i+2];
8290 (out+j)[5] = (ushort)in[i+3];
8293 (out+j)[0] = (ushort)in[i+4];
8294 (out+j)[1] = (ushort)in[i+6];
8295 (out+j)[2] = (ushort)in[i+2];
8296 (out+j)[3] = (ushort)in[i-2];
8297 (out+j)[4] = (ushort)in[i+0];
8298 (out+j)[5] = (ushort)in[i+3];
8311 ushort *out = (ushort*)_out;
8315 (out+j)[0] = (ushort)in[i];
8327 ushort *out = (ushort*)_out;
8331 (out+j)[0] = (ushort)in[i+1];
8332 (out+j)[1] = (ushort)in[i];
8344 ushort *out = (ushort*)_out;
8348 (out+j)[0] = (ushort)in[i+1];
8349 (out+j)[1] = (ushort)in[i];
8361 ushort *out = (ushort*)_out;
8365 (out+j)[0] = (ushort)in[i+1];
8366 (out+j)[1] = (ushort)in[i];
8368 (out+j)[0] = (ushort)in[start];
8369 (out+j)[1] = (ushort)in[i];
8380 ushort *out = (ushort*)_out;
8384 (out+j)[0] = (ushort)in[i+2];
8385 (out+j)[1] = (ushort)in[i];
8386 (out+j)[2] = (ushort)in[i+1];
8398 ushort *out = (ushort*)_out;
8402 (out+j)[0] = (ushort)in[i+2];
8403 (out+j)[1] = (ushort)in[i+(i&1)];
8404 (out+j)[2] = (ushort)in[i+1-(i&1)];
8416 ushort *out = (ushort*)_out;
8420 (out+j)[0] = (ushort)in[i+2];
8421 (out+j)[1] = (ushort)in[start];
8422 (out+j)[2] = (ushort)in[i+1];
8434 ushort *out = (ushort*)_out;
8440 (out+j+0)[0] = restart_index;
8441 (out+j+0)[1] = restart_index;
8442 (out+j+0)[2] = restart_index;
8443 (out+j+3)[0] = restart_index;
8444 (out+j+3)[1] = restart_index;
8445 (out+j+3)[2] = restart_index;
8464 (out+j+0)[0] = (ushort)in[i+3];
8465 (out+j+0)[1] = (ushort)in[i+0];
8466 (out+j+0)[2] = (ushort)in[i+1];
8467 (out+j+3)[0] = (ushort)in[i+3];
8468 (out+j+3)[1] = (ushort)in[i+1];
8469 (out+j+3)[2] = (ushort)in[i+2];
8481 ushort *out = (ushort*)_out;
8487 (out+j+0)[0] = restart_index;
8488 (out+j+0)[1] = restart_index;
8489 (out+j+0)[2] = restart_index;
8490 (out+j+3)[0] = restart_index;
8491 (out+j+3)[1] = restart_index;
8492 (out+j+3)[2] = restart_index;
8511 (out+j+0)[0] = (ushort)in[i+3];
8512 (out+j+0)[1] = (ushort)in[i+2];
8513 (out+j+0)[2] = (ushort)in[i+0];
8514 (out+j+3)[0] = (ushort)in[i+3];
8515 (out+j+3)[1] = (ushort)in[i+0];
8516 (out+j+3)[2] = (ushort)in[i+1];
8528 ushort *out = (ushort*)_out;
8534 (out+j+0)[0] = restart_index;
8535 (out+j+0)[1] = restart_index;
8536 (out+j+0)[2] = restart_index;
8554 (out+j)[0] = (ushort)in[start];
8555 (out+j)[1] = (ushort)in[i+1];
8556 (out+j)[2] = (ushort)in[i+2];
8568 ushort *out = (ushort*)_out;
8572 (out+j)[0] = (ushort)in[i+3];
8573 (out+j)[1] = (ushort)in[i+2];
8574 (out+j)[2] = (ushort)in[i+1];
8575 (out+j)[3] = (ushort)in[i+0];
8587 ushort *out = (ushort*)_out;
8591 (out+j)[0] = (ushort)in[i+3];
8592 (out+j)[1] = (ushort)in[i+2];
8593 (out+j)[2] = (ushort)in[i+1];
8594 (out+j)[3] = (ushort)in[i+0];
8606 ushort *out = (ushort*)_out;
8610 (out+j)[0] = (ushort)in[i+4];
8611 (out+j)[1] = (ushort)in[i+5];
8612 (out+j)[2] = (ushort)in[i+0];
8613 (out+j)[3] = (ushort)in[i+1];
8614 (out+j)[4] = (ushort)in[i+2];
8615 (out+j)[5] = (ushort)in[i+3];
8627 ushort *out = (ushort*)_out;
8633 (out+j)[0] = (ushort)in[i+4];
8634 (out+j)[1] = (ushort)in[i+5];
8635 (out+j)[2] = (ushort)in[i+0];
8636 (out+j)[3] = (ushort)in[i+1];
8637 (out+j)[4] = (ushort)in[i+2];
8638 (out+j)[5] = (ushort)in[i+3];
8641 (out+j)[0] = (ushort)in[i+4];
8642 (out+j)[1] = (ushort)in[i+6];
8643 (out+j)[2] = (ushort)in[i+2];
8644 (out+j)[3] = (ushort)in[i-2];
8645 (out+j)[4] = (ushort)in[i+0];
8646 (out+j)[5] = (ushort)in[i+3];
8659 ushort *out = (ushort*)_out;
8663 (out+j)[0] = (ushort)in[i];
8675 ushort *out = (ushort*)_out;
8679 (out+j)[0] = (ushort)in[i];
8680 (out+j)[1] = (ushort)in[i+1];
8692 ushort *out = (ushort*)_out;
8696 (out+j)[0] = (ushort)in[i];
8697 (out+j)[1] = (ushort)in[i+1];
8709 ushort *out = (ushort*)_out;
8713 (out+j)[0] = (ushort)in[i];
8714 (out+j)[1] = (ushort)in[i+1];
8716 (out+j)[0] = (ushort)in[i];
8717 (out+j)[1] = (ushort)in[start];
8728 ushort *out = (ushort*)_out;
8732 (out+j)[0] = (ushort)in[i];
8733 (out+j)[1] = (ushort)in[i+1];
8734 (out+j)[2] = (ushort)in[i+2];
8746 ushort *out = (ushort*)_out;
8750 (out+j)[0] = (ushort)in[i+(i&1)];
8751 (out+j)[1] = (ushort)in[i+1-(i&1)];
8752 (out+j)[2] = (ushort)in[i+2];
8764 ushort *out = (ushort*)_out;
8768 (out+j)[0] = (ushort)in[start];
8769 (out+j)[1] = (ushort)in[i+1];
8770 (out+j)[2] = (ushort)in[i+2];
8782 ushort *out = (ushort*)_out;
8786 (out+j+0)[0] = (ushort)in[i+0];
8787 (out+j+0)[1] = (ushort)in[i+1];
8788 (out+j+0)[2] = (ushort)in[i+3];
8789 (out+j+3)[0] = (ushort)in[i+1];
8790 (out+j+3)[1] = (ushort)in[i+2];
8791 (out+j+3)[2] = (ushort)in[i+3];
8803 ushort *out = (ushort*)_out;
8807 (out+j+0)[0] = (ushort)in[i+2];
8808 (out+j+0)[1] = (ushort)in[i+0];
8809 (out+j+0)[2] = (ushort)in[i+3];
8810 (out+j+3)[0] = (ushort)in[i+0];
8811 (out+j+3)[1] = (ushort)in[i+1];
8812 (out+j+3)[2] = (ushort)in[i+3];
8824 ushort *out = (ushort*)_out;
8828 (out+j)[0] = (ushort)in[i+1];
8829 (out+j)[1] = (ushort)in[i+2];
8830 (out+j)[2] = (ushort)in[start];
8842 ushort *out = (ushort*)_out;
8846 (out+j)[0] = (ushort)in[i+0];
8847 (out+j)[1] = (ushort)in[i+1];
8848 (out+j)[2] = (ushort)in[i+2];
8849 (out+j)[3] = (ushort)in[i+3];
8861 ushort *out = (ushort*)_out;
8865 (out+j)[0] = (ushort)in[i+0];
8866 (out+j)[1] = (ushort)in[i+1];
8867 (out+j)[2] = (ushort)in[i+2];
8868 (out+j)[3] = (ushort)in[i+3];
8880 ushort *out = (ushort*)_out;
8884 (out+j)[0] = (ushort)in[i+0];
8885 (out+j)[1] = (ushort)in[i+1];
8886 (out+j)[2] = (ushort)in[i+2];
8887 (out+j)[3] = (ushort)in[i+3];
8888 (out+j)[4] = (ushort)in[i+4];
8889 (out+j)[5] = (ushort)in[i+5];
8901 ushort *out = (ushort*)_out;
8907 (out+j)[0] = (ushort)in[i+0];
8908 (out+j)[1] = (ushort)in[i+1];
8909 (out+j)[2] = (ushort)in[i+2];
8910 (out+j)[3] = (ushort)in[i+3];
8911 (out+j)[4] = (ushort)in[i+4];
8912 (out+j)[5] = (ushort)in[i+5];
8915 (out+j)[0] = (ushort)in[i+2];
8916 (out+j)[1] = (ushort)in[i-2];
8917 (out+j)[2] = (ushort)in[i+0];
8918 (out+j)[3] = (ushort)in[i+3];
8919 (out+j)[4] = (ushort)in[i+4];
8920 (out+j)[5] = (ushort)in[i+6];
8933 ushort *out = (ushort*)_out;
8937 (out+j)[0] = (ushort)in[i];
8949 ushort *out = (ushort*)_out;
8953 (out+j)[0] = (ushort)in[i];
8954 (out+j)[1] = (ushort)in[i+1];
8966 ushort *out = (ushort*)_out;
8970 (out+j)[0] = (ushort)in[i];
8971 (out+j)[1] = (ushort)in[i+1];
8983 ushort *out = (ushort*)_out;
8987 (out+j)[0] = (ushort)in[i];
8988 (out+j)[1] = (ushort)in[i+1];
8990 (out+j)[0] = (ushort)in[i];
8991 (out+j)[1] = (ushort)in[start];
9002 ushort *out = (ushort*)_out;
9006 (out+j)[0] = (ushort)in[i];
9007 (out+j)[1] = (ushort)in[i+1];
9008 (out+j)[2] = (ushort)in[i+2];
9020 ushort *out = (ushort*)_out;
9024 (out+j)[0] = (ushort)in[i+(i&1)];
9025 (out+j)[1] = (ushort)in[i+1-(i&1)];
9026 (out+j)[2] = (ushort)in[i+2];
9038 ushort *out = (ushort*)_out;
9042 (out+j)[0] = (ushort)in[start];
9043 (out+j)[1] = (ushort)in[i+1];
9044 (out+j)[2] = (ushort)in[i+2];
9056 ushort *out = (ushort*)_out;
9062 (out+j+0)[0] = restart_index;
9063 (out+j+0)[1] = restart_index;
9064 (out+j+0)[2] = restart_index;
9065 (out+j+3)[0] = restart_index;
9066 (out+j+3)[1] = restart_index;
9067 (out+j+3)[2] = restart_index;
9086 (out+j+0)[0] = (ushort)in[i+0];
9087 (out+j+0)[1] = (ushort)in[i+1];
9088 (out+j+0)[2] = (ushort)in[i+3];
9089 (out+j+3)[0] = (ushort)in[i+1];
9090 (out+j+3)[1] = (ushort)in[i+2];
9091 (out+j+3)[2] = (ushort)in[i+3];
9103 ushort *out = (ushort*)_out;
9109 (out+j+0)[0] = restart_index;
9110 (out+j+0)[1] = restart_index;
9111 (out+j+0)[2] = restart_index;
9112 (out+j+3)[0] = restart_index;
9113 (out+j+3)[1] = restart_index;
9114 (out+j+3)[2] = restart_index;
9133 (out+j+0)[0] = (ushort)in[i+2];
9134 (out+j+0)[1] = (ushort)in[i+0];
9135 (out+j+0)[2] = (ushort)in[i+3];
9136 (out+j+3)[0] = (ushort)in[i+0];
9137 (out+j+3)[1] = (ushort)in[i+1];
9138 (out+j+3)[2] = (ushort)in[i+3];
9150 ushort *out = (ushort*)_out;
9156 (out+j+0)[0] = restart_index;
9157 (out+j+0)[1] = restart_index;
9158 (out+j+0)[2] = restart_index;
9176 (out+j)[0] = (ushort)in[i+1];
9177 (out+j)[1] = (ushort)in[i+2];
9178 (out+j)[2] = (ushort)in[start];
9190 ushort *out = (ushort*)_out;
9194 (out+j)[0] = (ushort)in[i+0];
9195 (out+j)[1] = (ushort)in[i+1];
9196 (out+j)[2] = (ushort)in[i+2];
9197 (out+j)[3] = (ushort)in[i+3];
9209 ushort *out = (ushort*)_out;
9213 (out+j)[0] = (ushort)in[i+0];
9214 (out+j)[1] = (ushort)in[i+1];
9215 (out+j)[2] = (ushort)in[i+2];
9216 (out+j)[3] = (ushort)in[i+3];
9228 ushort *out = (ushort*)_out;
9232 (out+j)[0] = (ushort)in[i+0];
9233 (out+j)[1] = (ushort)in[i+1];
9234 (out+j)[2] = (ushort)in[i+2];
9235 (out+j)[3] = (ushort)in[i+3];
9236 (out+j)[4] = (ushort)in[i+4];
9237 (out+j)[5] = (ushort)in[i+5];
9249 ushort *out = (ushort*)_out;
9255 (out+j)[0] = (ushort)in[i+0];
9256 (out+j)[1] = (ushort)in[i+1];
9257 (out+j)[2] = (ushort)in[i+2];
9258 (out+j)[3] = (ushort)in[i+3];
9259 (out+j)[4] = (ushort)in[i+4];
9260 (out+j)[5] = (ushort)in[i+5];
9263 (out+j)[0] = (ushort)in[i+2];
9264 (out+j)[1] = (ushort)in[i-2];
9265 (out+j)[2] = (ushort)in[i+0];
9266 (out+j)[3] = (ushort)in[i+3];
9267 (out+j)[4] = (ushort)in[i+4];
9268 (out+j)[5] = (ushort)in[i+6];
9281 uint *out = (uint*)_out;
9285 (out+j)[0] = (uint)in[i];
9297 uint *out = (uint*)_out;
9301 (out+j)[0] = (uint)in[i];
9302 (out+j)[1] = (uint)in[i+1];
9314 uint *out = (uint*)_out;
9318 (out+j)[0] = (uint)in[i];
9319 (out+j)[1] = (uint)in[i+1];
9331 uint *out = (uint*)_out;
9335 (out+j)[0] = (uint)in[i];
9336 (out+j)[1] = (uint)in[i+1];
9338 (out+j)[0] = (uint)in[i];
9339 (out+j)[1] = (uint)in[start];
9350 uint *out = (uint*)_out;
9354 (out+j)[0] = (uint)in[i];
9355 (out+j)[1] = (uint)in[i+1];
9356 (out+j)[2] = (uint)in[i+2];
9368 uint *out = (uint*)_out;
9372 (out+j)[0] = (uint)in[i];
9373 (out+j)[1] = (uint)in[i+1+(i&1)];
9374 (out+j)[2] = (uint)in[i+2-(i&1)];
9386 uint *out = (uint*)_out;
9390 (out+j)[0] = (uint)in[start];
9391 (out+j)[1] = (uint)in[i+1];
9392 (out+j)[2] = (uint)in[i+2];
9404 uint *out = (uint*)_out;
9408 (out+j+0)[0] = (uint)in[i+0];
9409 (out+j+0)[1] = (uint)in[i+1];
9410 (out+j+0)[2] = (uint)in[i+2];
9411 (out+j+3)[0] = (uint)in[i+0];
9412 (out+j+3)[1] = (uint)in[i+2];
9413 (out+j+3)[2] = (uint)in[i+3];
9425 uint *out = (uint*)_out;
9429 (out+j+0)[0] = (uint)in[i+0];
9430 (out+j+0)[1] = (uint)in[i+1];
9431 (out+j+0)[2] = (uint)in[i+3];
9432 (out+j+3)[0] = (uint)in[i+0];
9433 (out+j+3)[1] = (uint)in[i+3];
9434 (out+j+3)[2] = (uint)in[i+2];
9446 uint *out = (uint*)_out;
9450 (out+j)[0] = (uint)in[start];
9451 (out+j)[1] = (uint)in[i+1];
9452 (out+j)[2] = (uint)in[i+2];
9464 uint *out = (uint*)_out;
9468 (out+j)[0] = (uint)in[i+0];
9469 (out+j)[1] = (uint)in[i+1];
9470 (out+j)[2] = (uint)in[i+2];
9471 (out+j)[3] = (uint)in[i+3];
9483 uint *out = (uint*)_out;
9487 (out+j)[0] = (uint)in[i+0];
9488 (out+j)[1] = (uint)in[i+1];
9489 (out+j)[2] = (uint)in[i+2];
9490 (out+j)[3] = (uint)in[i+3];
9502 uint *out = (uint*)_out;
9506 (out+j)[0] = (uint)in[i+0];
9507 (out+j)[1] = (uint)in[i+1];
9508 (out+j)[2] = (uint)in[i+2];
9509 (out+j)[3] = (uint)in[i+3];
9510 (out+j)[4] = (uint)in[i+4];
9511 (out+j)[5] = (uint)in[i+5];
9523 uint *out = (uint*)_out;
9529 (out+j)[0] = (uint)in[i+0];
9530 (out+j)[1] = (uint)in[i+1];
9531 (out+j)[2] = (uint)in[i+2];
9532 (out+j)[3] = (uint)in[i+3];
9533 (out+j)[4] = (uint)in[i+4];
9534 (out+j)[5] = (uint)in[i+5];
9537 (out+j)[0] = (uint)in[i+2];
9538 (out+j)[1] = (uint)in[i-2];
9539 (out+j)[2] = (uint)in[i+0];
9540 (out+j)[3] = (uint)in[i+3];
9541 (out+j)[4] = (uint)in[i+4];
9542 (out+j)[5] = (uint)in[i+6];
9555 uint *out = (uint*)_out;
9559 (out+j)[0] = (uint)in[i];
9571 uint *out = (uint*)_out;
9575 (out+j)[0] = (uint)in[i];
9576 (out+j)[1] = (uint)in[i+1];
9588 uint *out = (uint*)_out;
9592 (out+j)[0] = (uint)in[i];
9593 (out+j)[1] = (uint)in[i+1];
9605 uint *out = (uint*)_out;
9609 (out+j)[0] = (uint)in[i];
9610 (out+j)[1] = (uint)in[i+1];
9612 (out+j)[0] = (uint)in[i];
9613 (out+j)[1] = (uint)in[start];
9624 uint *out = (uint*)_out;
9628 (out+j)[0] = (uint)in[i];
9629 (out+j)[1] = (uint)in[i+1];
9630 (out+j)[2] = (uint)in[i+2];
9642 uint *out = (uint*)_out;
9646 (out+j)[0] = (uint)in[i];
9647 (out+j)[1] = (uint)in[i+1+(i&1)];
9648 (out+j)[2] = (uint)in[i+2-(i&1)];
9660 uint *out = (uint*)_out;
9664 (out+j)[0] = (uint)in[start];
9665 (out+j)[1] = (uint)in[i+1];
9666 (out+j)[2] = (uint)in[i+2];
9678 uint *out = (uint*)_out;
9684 (out+j+0)[0] = restart_index;
9685 (out+j+0)[1] = restart_index;
9686 (out+j+0)[2] = restart_index;
9687 (out+j+3)[0] = restart_index;
9688 (out+j+3)[1] = restart_index;
9689 (out+j+3)[2] = restart_index;
9708 (out+j+0)[0] = (uint)in[i+0];
9709 (out+j+0)[1] = (uint)in[i+1];
9710 (out+j+0)[2] = (uint)in[i+2];
9711 (out+j+3)[0] = (uint)in[i+0];
9712 (out+j+3)[1] = (uint)in[i+2];
9713 (out+j+3)[2] = (uint)in[i+3];
9725 uint *out = (uint*)_out;
9731 (out+j+0)[0] = restart_index;
9732 (out+j+0)[1] = restart_index;
9733 (out+j+0)[2] = restart_index;
9734 (out+j+3)[0] = restart_index;
9735 (out+j+3)[1] = restart_index;
9736 (out+j+3)[2] = restart_index;
9755 (out+j+0)[0] = (uint)in[i+0];
9756 (out+j+0)[1] = (uint)in[i+1];
9757 (out+j+0)[2] = (uint)in[i+3];
9758 (out+j+3)[0] = (uint)in[i+0];
9759 (out+j+3)[1] = (uint)in[i+3];
9760 (out+j+3)[2] = (uint)in[i+2];
9772 uint *out = (uint*)_out;
9778 (out+j+0)[0] = restart_index;
9779 (out+j+0)[1] = restart_index;
9780 (out+j+0)[2] = restart_index;
9798 (out+j)[0] = (uint)in[start];
9799 (out+j)[1] = (uint)in[i+1];
9800 (out+j)[2] = (uint)in[i+2];
9812 uint *out = (uint*)_out;
9816 (out+j)[0] = (uint)in[i+0];
9817 (out+j)[1] = (uint)in[i+1];
9818 (out+j)[2] = (uint)in[i+2];
9819 (out+j)[3] = (uint)in[i+3];
9831 uint *out = (uint*)_out;
9835 (out+j)[0] = (uint)in[i+0];
9836 (out+j)[1] = (uint)in[i+1];
9837 (out+j)[2] = (uint)in[i+2];
9838 (out+j)[3] = (uint)in[i+3];
9850 uint *out = (uint*)_out;
9854 (out+j)[0] = (uint)in[i+0];
9855 (out+j)[1] = (uint)in[i+1];
9856 (out+j)[2] = (uint)in[i+2];
9857 (out+j)[3] = (uint)in[i+3];
9858 (out+j)[4] = (uint)in[i+4];
9859 (out+j)[5] = (uint)in[i+5];
9871 uint *out = (uint*)_out;
9877 (out+j)[0] = (uint)in[i+0];
9878 (out+j)[1] = (uint)in[i+1];
9879 (out+j)[2] = (uint)in[i+2];
9880 (out+j)[3] = (uint)in[i+3];
9881 (out+j)[4] = (uint)in[i+4];
9882 (out+j)[5] = (uint)in[i+5];
9885 (out+j)[0] = (uint)in[i+2];
9886 (out+j)[1] = (uint)in[i-2];
9887 (out+j)[2] = (uint)in[i+0];
9888 (out+j)[3] = (uint)in[i+3];
9889 (out+j)[4] = (uint)in[i+4];
9890 (out+j)[5] = (uint)in[i+6];
9903 uint *out = (uint*)_out;
9907 (out+j)[0] = (uint)in[i];
9919 uint *out = (uint*)_out;
9923 (out+j)[0] = (uint)in[i+1];
9924 (out+j)[1] = (uint)in[i];
9936 uint *out = (uint*)_out;
9940 (out+j)[0] = (uint)in[i+1];
9941 (out+j)[1] = (uint)in[i];
9953 uint *out = (uint*)_out;
9957 (out+j)[0] = (uint)in[i+1];
9958 (out+j)[1] = (uint)in[i];
9960 (out+j)[0] = (uint)in[start];
9961 (out+j)[1] = (uint)in[i];
9972 uint *out = (uint*)_out;
9976 (out+j)[0] = (uint)in[i+1];
9977 (out+j)[1] = (uint)in[i+2];
9978 (out+j)[2] = (uint)in[i];
9990 uint *out = (uint*)_out;
9994 (out+j)[0] = (uint)in[i+1+(i&1)];
9995 (out+j)[1] = (uint)in[i+2-(i&1)];
9996 (out+j)[2] = (uint)in[i];
10008 uint *out = (uint*)_out;
10012 (out+j)[0] = (uint)in[i+1];
10013 (out+j)[1] = (uint)in[i+2];
10014 (out+j)[2] = (uint)in[start];
10026 uint *out = (uint*)_out;
10030 (out+j+0)[0] = (uint)in[i+1];
10031 (out+j+0)[1] = (uint)in[i+2];
10032 (out+j+0)[2] = (uint)in[i+0];
10033 (out+j+3)[0] = (uint)in[i+2];
10034 (out+j+3)[1] = (uint)in[i+3];
10035 (out+j+3)[2] = (uint)in[i+0];
10047 uint *out = (uint*)_out;
10051 (out+j+0)[0] = (uint)in[i+1];
10052 (out+j+0)[1] = (uint)in[i+3];
10053 (out+j+0)[2] = (uint)in[i+0];
10054 (out+j+3)[0] = (uint)in[i+3];
10055 (out+j+3)[1] = (uint)in[i+2];
10056 (out+j+3)[2] = (uint)in[i+0];
10068 uint *out = (uint*)_out;
10072 (out+j)[0] = (uint)in[i+1];
10073 (out+j)[1] = (uint)in[i+2];
10074 (out+j)[2] = (uint)in[start];
10086 uint *out = (uint*)_out;
10090 (out+j)[0] = (uint)in[i+3];
10091 (out+j)[1] = (uint)in[i+2];
10092 (out+j)[2] = (uint)in[i+1];
10093 (out+j)[3] = (uint)in[i+0];
10105 uint *out = (uint*)_out;
10109 (out+j)[0] = (uint)in[i+3];
10110 (out+j)[1] = (uint)in[i+2];
10111 (out+j)[2] = (uint)in[i+1];
10112 (out+j)[3] = (uint)in[i+0];
10124 uint *out = (uint*)_out;
10128 (out+j)[0] = (uint)in[i+4];
10129 (out+j)[1] = (uint)in[i+5];
10130 (out+j)[2] = (uint)in[i+0];
10131 (out+j)[3] = (uint)in[i+1];
10132 (out+j)[4] = (uint)in[i+2];
10133 (out+j)[5] = (uint)in[i+3];
10145 uint *out = (uint*)_out;
10151 (out+j)[0] = (uint)in[i+4];
10152 (out+j)[1] = (uint)in[i+5];
10153 (out+j)[2] = (uint)in[i+0];
10154 (out+j)[3] = (uint)in[i+1];
10155 (out+j)[4] = (uint)in[i+2];
10156 (out+j)[5] = (uint)in[i+3];
10159 (out+j)[0] = (uint)in[i+4];
10160 (out+j)[1] = (uint)in[i+6];
10161 (out+j)[2] = (uint)in[i+2];
10162 (out+j)[3] = (uint)in[i-2];
10163 (out+j)[4] = (uint)in[i+0];
10164 (out+j)[5] = (uint)in[i+3];
10177 uint *out = (uint*)_out;
10181 (out+j)[0] = (uint)in[i];
10193 uint *out = (uint*)_out;
10197 (out+j)[0] = (uint)in[i+1];
10198 (out+j)[1] = (uint)in[i];
10210 uint *out = (uint*)_out;
10214 (out+j)[0] = (uint)in[i+1];
10215 (out+j)[1] = (uint)in[i];
10227 uint *out = (uint*)_out;
10231 (out+j)[0] = (uint)in[i+1];
10232 (out+j)[1] = (uint)in[i];
10234 (out+j)[0] = (uint)in[start];
10235 (out+j)[1] = (uint)in[i];
10246 uint *out = (uint*)_out;
10250 (out+j)[0] = (uint)in[i+1];
10251 (out+j)[1] = (uint)in[i+2];
10252 (out+j)[2] = (uint)in[i];
10264 uint *out = (uint*)_out;
10268 (out+j)[0] = (uint)in[i+1+(i&1)];
10269 (out+j)[1] = (uint)in[i+2-(i&1)];
10270 (out+j)[2] = (uint)in[i];
10282 uint *out = (uint*)_out;
10286 (out+j)[0] = (uint)in[i+1];
10287 (out+j)[1] = (uint)in[i+2];
10288 (out+j)[2] = (uint)in[start];
10300 uint *out = (uint*)_out;
10306 (out+j+0)[0] = restart_index;
10307 (out+j+0)[1] = restart_index;
10308 (out+j+0)[2] = restart_index;
10309 (out+j+3)[0] = restart_index;
10310 (out+j+3)[1] = restart_index;
10311 (out+j+3)[2] = restart_index;
10330 (out+j+0)[0] = (uint)in[i+1];
10331 (out+j+0)[1] = (uint)in[i+2];
10332 (out+j+0)[2] = (uint)in[i+0];
10333 (out+j+3)[0] = (uint)in[i+2];
10334 (out+j+3)[1] = (uint)in[i+3];
10335 (out+j+3)[2] = (uint)in[i+0];
10347 uint *out = (uint*)_out;
10353 (out+j+0)[0] = restart_index;
10354 (out+j+0)[1] = restart_index;
10355 (out+j+0)[2] = restart_index;
10356 (out+j+3)[0] = restart_index;
10357 (out+j+3)[1] = restart_index;
10358 (out+j+3)[2] = restart_index;
10377 (out+j+0)[0] = (uint)in[i+1];
10378 (out+j+0)[1] = (uint)in[i+3];
10379 (out+j+0)[2] = (uint)in[i+0];
10380 (out+j+3)[0] = (uint)in[i+3];
10381 (out+j+3)[1] = (uint)in[i+2];
10382 (out+j+3)[2] = (uint)in[i+0];
10394 uint *out = (uint*)_out;
10400 (out+j+0)[0] = restart_index;
10401 (out+j+0)[1] = restart_index;
10402 (out+j+0)[2] = restart_index;
10420 (out+j)[0] = (uint)in[i+1];
10421 (out+j)[1] = (uint)in[i+2];
10422 (out+j)[2] = (uint)in[start];
10434 uint *out = (uint*)_out;
10438 (out+j)[0] = (uint)in[i+3];
10439 (out+j)[1] = (uint)in[i+2];
10440 (out+j)[2] = (uint)in[i+1];
10441 (out+j)[3] = (uint)in[i+0];
10453 uint *out = (uint*)_out;
10457 (out+j)[0] = (uint)in[i+3];
10458 (out+j)[1] = (uint)in[i+2];
10459 (out+j)[2] = (uint)in[i+1];
10460 (out+j)[3] = (uint)in[i+0];
10472 uint *out = (uint*)_out;
10476 (out+j)[0] = (uint)in[i+4];
10477 (out+j)[1] = (uint)in[i+5];
10478 (out+j)[2] = (uint)in[i+0];
10479 (out+j)[3] = (uint)in[i+1];
10480 (out+j)[4] = (uint)in[i+2];
10481 (out+j)[5] = (uint)in[i+3];
10493 uint *out = (uint*)_out;
10499 (out+j)[0] = (uint)in[i+4];
10500 (out+j)[1] = (uint)in[i+5];
10501 (out+j)[2] = (uint)in[i+0];
10502 (out+j)[3] = (uint)in[i+1];
10503 (out+j)[4] = (uint)in[i+2];
10504 (out+j)[5] = (uint)in[i+3];
10507 (out+j)[0] = (uint)in[i+4];
10508 (out+j)[1] = (uint)in[i+6];
10509 (out+j)[2] = (uint)in[i+2];
10510 (out+j)[3] = (uint)in[i-2];
10511 (out+j)[4] = (uint)in[i+0];
10512 (out+j)[5] = (uint)in[i+3];
10525 uint *out = (uint*)_out;
10529 (out+j)[0] = (uint)in[i];
10541 uint *out = (uint*)_out;
10545 (out+j)[0] = (uint)in[i+1];
10546 (out+j)[1] = (uint)in[i];
10558 uint *out = (uint*)_out;
10562 (out+j)[0] = (uint)in[i+1];
10563 (out+j)[1] = (uint)in[i];
10575 uint *out = (uint*)_out;
10579 (out+j)[0] = (uint)in[i+1];
10580 (out+j)[1] = (uint)in[i];
10582 (out+j)[0] = (uint)in[start];
10583 (out+j)[1] = (uint)in[i];
10594 uint *out = (uint*)_out;
10598 (out+j)[0] = (uint)in[i+2];
10599 (out+j)[1] = (uint)in[i];
10600 (out+j)[2] = (uint)in[i+1];
10612 uint *out = (uint*)_out;
10616 (out+j)[0] = (uint)in[i+2];
10617 (out+j)[1] = (uint)in[i+(i&1)];
10618 (out+j)[2] = (uint)in[i+1-(i&1)];
10630 uint *out = (uint*)_out;
10634 (out+j)[0] = (uint)in[i+2];
10635 (out+j)[1] = (uint)in[start];
10636 (out+j)[2] = (uint)in[i+1];
10648 uint *out = (uint*)_out;
10652 (out+j+0)[0] = (uint)in[i+3];
10653 (out+j+0)[1] = (uint)in[i+0];
10654 (out+j+0)[2] = (uint)in[i+1];
10655 (out+j+3)[0] = (uint)in[i+3];
10656 (out+j+3)[1] = (uint)in[i+1];
10657 (out+j+3)[2] = (uint)in[i+2];
10669 uint *out = (uint*)_out;
10673 (out+j+0)[0] = (uint)in[i+3];
10674 (out+j+0)[1] = (uint)in[i+2];
10675 (out+j+0)[2] = (uint)in[i+0];
10676 (out+j+3)[0] = (uint)in[i+3];
10677 (out+j+3)[1] = (uint)in[i+0];
10678 (out+j+3)[2] = (uint)in[i+1];
10690 uint *out = (uint*)_out;
10694 (out+j)[0] = (uint)in[start];
10695 (out+j)[1] = (uint)in[i+1];
10696 (out+j)[2] = (uint)in[i+2];
10708 uint *out = (uint*)_out;
10712 (out+j)[0] = (uint)in[i+3];
10713 (out+j)[1] = (uint)in[i+2];
10714 (out+j)[2] = (uint)in[i+1];
10715 (out+j)[3] = (uint)in[i+0];
10727 uint *out = (uint*)_out;
10731 (out+j)[0] = (uint)in[i+3];
10732 (out+j)[1] = (uint)in[i+2];
10733 (out+j)[2] = (uint)in[i+1];
10734 (out+j)[3] = (uint)in[i+0];
10746 uint *out = (uint*)_out;
10750 (out+j)[0] = (uint)in[i+4];
10751 (out+j)[1] = (uint)in[i+5];
10752 (out+j)[2] = (uint)in[i+0];
10753 (out+j)[3] = (uint)in[i+1];
10754 (out+j)[4] = (uint)in[i+2];
10755 (out+j)[5] = (uint)in[i+3];
10767 uint *out = (uint*)_out;
10773 (out+j)[0] = (uint)in[i+4];
10774 (out+j)[1] = (uint)in[i+5];
10775 (out+j)[2] = (uint)in[i+0];
10776 (out+j)[3] = (uint)in[i+1];
10777 (out+j)[4] = (uint)in[i+2];
10778 (out+j)[5] = (uint)in[i+3];
10781 (out+j)[0] = (uint)in[i+4];
10782 (out+j)[1] = (uint)in[i+6];
10783 (out+j)[2] = (uint)in[i+2];
10784 (out+j)[3] = (uint)in[i-2];
10785 (out+j)[4] = (uint)in[i+0];
10786 (out+j)[5] = (uint)in[i+3];
10799 uint *out = (uint*)_out;
10803 (out+j)[0] = (uint)in[i];
10815 uint *out = (uint*)_out;
10819 (out+j)[0] = (uint)in[i+1];
10820 (out+j)[1] = (uint)in[i];
10832 uint *out = (uint*)_out;
10836 (out+j)[0] = (uint)in[i+1];
10837 (out+j)[1] = (uint)in[i];
10849 uint *out = (uint*)_out;
10853 (out+j)[0] = (uint)in[i+1];
10854 (out+j)[1] = (uint)in[i];
10856 (out+j)[0] = (uint)in[start];
10857 (out+j)[1] = (uint)in[i];
10868 uint *out = (uint*)_out;
10872 (out+j)[0] = (uint)in[i+2];
10873 (out+j)[1] = (uint)in[i];
10874 (out+j)[2] = (uint)in[i+1];
10886 uint *out = (uint*)_out;
10890 (out+j)[0] = (uint)in[i+2];
10891 (out+j)[1] = (uint)in[i+(i&1)];
10892 (out+j)[2] = (uint)in[i+1-(i&1)];
10904 uint *out = (uint*)_out;
10908 (out+j)[0] = (uint)in[i+2];
10909 (out+j)[1] = (uint)in[start];
10910 (out+j)[2] = (uint)in[i+1];
10922 uint *out = (uint*)_out;
10928 (out+j+0)[0] = restart_index;
10929 (out+j+0)[1] = restart_index;
10930 (out+j+0)[2] = restart_index;
10931 (out+j+3)[0] = restart_index;
10932 (out+j+3)[1] = restart_index;
10933 (out+j+3)[2] = restart_index;
10952 (out+j+0)[0] = (uint)in[i+3];
10953 (out+j+0)[1] = (uint)in[i+0];
10954 (out+j+0)[2] = (uint)in[i+1];
10955 (out+j+3)[0] = (uint)in[i+3];
10956 (out+j+3)[1] = (uint)in[i+1];
10957 (out+j+3)[2] = (uint)in[i+2];
10969 uint *out = (uint*)_out;
10975 (out+j+0)[0] = restart_index;
10976 (out+j+0)[1] = restart_index;
10977 (out+j+0)[2] = restart_index;
10978 (out+j+3)[0] = restart_index;
10979 (out+j+3)[1] = restart_index;
10980 (out+j+3)[2] = restart_index;
10999 (out+j+0)[0] = (uint)in[i+3];
11000 (out+j+0)[1] = (uint)in[i+2];
11001 (out+j+0)[2] = (uint)in[i+0];
11002 (out+j+3)[0] = (uint)in[i+3];
11003 (out+j+3)[1] = (uint)in[i+0];
11004 (out+j+3)[2] = (uint)in[i+1];
11016 uint *out = (uint*)_out;
11022 (out+j+0)[0] = restart_index;
11023 (out+j+0)[1] = restart_index;
11024 (out+j+0)[2] = restart_index;
11042 (out+j)[0] = (uint)in[start];
11043 (out+j)[1] = (uint)in[i+1];
11044 (out+j)[2] = (uint)in[i+2];
11056 uint *out = (uint*)_out;
11060 (out+j)[0] = (uint)in[i+3];
11061 (out+j)[1] = (uint)in[i+2];
11062 (out+j)[2] = (uint)in[i+1];
11063 (out+j)[3] = (uint)in[i+0];
11075 uint *out = (uint*)_out;
11079 (out+j)[0] = (uint)in[i+3];
11080 (out+j)[1] = (uint)in[i+2];
11081 (out+j)[2] = (uint)in[i+1];
11082 (out+j)[3] = (uint)in[i+0];
11094 uint *out = (uint*)_out;
11098 (out+j)[0] = (uint)in[i+4];
11099 (out+j)[1] = (uint)in[i+5];
11100 (out+j)[2] = (uint)in[i+0];
11101 (out+j)[3] = (uint)in[i+1];
11102 (out+j)[4] = (uint)in[i+2];
11103 (out+j)[5] = (uint)in[i+3];
11115 uint *out = (uint*)_out;
11121 (out+j)[0] = (uint)in[i+4];
11122 (out+j)[1] = (uint)in[i+5];
11123 (out+j)[2] = (uint)in[i+0];
11124 (out+j)[3] = (uint)in[i+1];
11125 (out+j)[4] = (uint)in[i+2];
11126 (out+j)[5] = (uint)in[i+3];
11129 (out+j)[0] = (uint)in[i+4];
11130 (out+j)[1] = (uint)in[i+6];
11131 (out+j)[2] = (uint)in[i+2];
11132 (out+j)[3] = (uint)in[i-2];
11133 (out+j)[4] = (uint)in[i+0];
11134 (out+j)[5] = (uint)in[i+3];
11147 uint *out = (uint*)_out;
11151 (out+j)[0] = (uint)in[i];
11163 uint *out = (uint*)_out;
11167 (out+j)[0] = (uint)in[i];
11168 (out+j)[1] = (uint)in[i+1];
11180 uint *out = (uint*)_out;
11184 (out+j)[0] = (uint)in[i];
11185 (out+j)[1] = (uint)in[i+1];
11197 uint *out = (uint*)_out;
11201 (out+j)[0] = (uint)in[i];
11202 (out+j)[1] = (uint)in[i+1];
11204 (out+j)[0] = (uint)in[i];
11205 (out+j)[1] = (uint)in[start];
11216 uint *out = (uint*)_out;
11220 (out+j)[0] = (uint)in[i];
11221 (out+j)[1] = (uint)in[i+1];
11222 (out+j)[2] = (uint)in[i+2];
11234 uint *out = (uint*)_out;
11238 (out+j)[0] = (uint)in[i+(i&1)];
11239 (out+j)[1] = (uint)in[i+1-(i&1)];
11240 (out+j)[2] = (uint)in[i+2];
11252 uint *out = (uint*)_out;
11256 (out+j)[0] = (uint)in[start];
11257 (out+j)[1] = (uint)in[i+1];
11258 (out+j)[2] = (uint)in[i+2];
11270 uint *out = (uint*)_out;
11274 (out+j+0)[0] = (uint)in[i+0];
11275 (out+j+0)[1] = (uint)in[i+1];
11276 (out+j+0)[2] = (uint)in[i+3];
11277 (out+j+3)[0] = (uint)in[i+1];
11278 (out+j+3)[1] = (uint)in[i+2];
11279 (out+j+3)[2] = (uint)in[i+3];
11291 uint *out = (uint*)_out;
11295 (out+j+0)[0] = (uint)in[i+2];
11296 (out+j+0)[1] = (uint)in[i+0];
11297 (out+j+0)[2] = (uint)in[i+3];
11298 (out+j+3)[0] = (uint)in[i+0];
11299 (out+j+3)[1] = (uint)in[i+1];
11300 (out+j+3)[2] = (uint)in[i+3];
11312 uint *out = (uint*)_out;
11316 (out+j)[0] = (uint)in[i+1];
11317 (out+j)[1] = (uint)in[i+2];
11318 (out+j)[2] = (uint)in[start];
11330 uint *out = (uint*)_out;
11334 (out+j)[0] = (uint)in[i+0];
11335 (out+j)[1] = (uint)in[i+1];
11336 (out+j)[2] = (uint)in[i+2];
11337 (out+j)[3] = (uint)in[i+3];
11349 uint *out = (uint*)_out;
11353 (out+j)[0] = (uint)in[i+0];
11354 (out+j)[1] = (uint)in[i+1];
11355 (out+j)[2] = (uint)in[i+2];
11356 (out+j)[3] = (uint)in[i+3];
11368 uint *out = (uint*)_out;
11372 (out+j)[0] = (uint)in[i+0];
11373 (out+j)[1] = (uint)in[i+1];
11374 (out+j)[2] = (uint)in[i+2];
11375 (out+j)[3] = (uint)in[i+3];
11376 (out+j)[4] = (uint)in[i+4];
11377 (out+j)[5] = (uint)in[i+5];
11389 uint *out = (uint*)_out;
11395 (out+j)[0] = (uint)in[i+0];
11396 (out+j)[1] = (uint)in[i+1];
11397 (out+j)[2] = (uint)in[i+2];
11398 (out+j)[3] = (uint)in[i+3];
11399 (out+j)[4] = (uint)in[i+4];
11400 (out+j)[5] = (uint)in[i+5];
11403 (out+j)[0] = (uint)in[i+2];
11404 (out+j)[1] = (uint)in[i-2];
11405 (out+j)[2] = (uint)in[i+0];
11406 (out+j)[3] = (uint)in[i+3];
11407 (out+j)[4] = (uint)in[i+4];
11408 (out+j)[5] = (uint)in[i+6];
11421 uint *out = (uint*)_out;
11425 (out+j)[0] = (uint)in[i];
11437 uint *out = (uint*)_out;
11441 (out+j)[0] = (uint)in[i];
11442 (out+j)[1] = (uint)in[i+1];
11454 uint *out = (uint*)_out;
11458 (out+j)[0] = (uint)in[i];
11459 (out+j)[1] = (uint)in[i+1];
11471 uint *out = (uint*)_out;
11475 (out+j)[0] = (uint)in[i];
11476 (out+j)[1] = (uint)in[i+1];
11478 (out+j)[0] = (uint)in[i];
11479 (out+j)[1] = (uint)in[start];
11490 uint *out = (uint*)_out;
11494 (out+j)[0] = (uint)in[i];
11495 (out+j)[1] = (uint)in[i+1];
11496 (out+j)[2] = (uint)in[i+2];
11508 uint *out = (uint*)_out;
11512 (out+j)[0] = (uint)in[i+(i&1)];
11513 (out+j)[1] = (uint)in[i+1-(i&1)];
11514 (out+j)[2] = (uint)in[i+2];
11526 uint *out = (uint*)_out;
11530 (out+j)[0] = (uint)in[start];
11531 (out+j)[1] = (uint)in[i+1];
11532 (out+j)[2] = (uint)in[i+2];
11544 uint *out = (uint*)_out;
11550 (out+j+0)[0] = restart_index;
11551 (out+j+0)[1] = restart_index;
11552 (out+j+0)[2] = restart_index;
11553 (out+j+3)[0] = restart_index;
11554 (out+j+3)[1] = restart_index;
11555 (out+j+3)[2] = restart_index;
11574 (out+j+0)[0] = (uint)in[i+0];
11575 (out+j+0)[1] = (uint)in[i+1];
11576 (out+j+0)[2] = (uint)in[i+3];
11577 (out+j+3)[0] = (uint)in[i+1];
11578 (out+j+3)[1] = (uint)in[i+2];
11579 (out+j+3)[2] = (uint)in[i+3];
11591 uint *out = (uint*)_out;
11597 (out+j+0)[0] = restart_index;
11598 (out+j+0)[1] = restart_index;
11599 (out+j+0)[2] = restart_index;
11600 (out+j+3)[0] = restart_index;
11601 (out+j+3)[1] = restart_index;
11602 (out+j+3)[2] = restart_index;
11621 (out+j+0)[0] = (uint)in[i+2];
11622 (out+j+0)[1] = (uint)in[i+0];
11623 (out+j+0)[2] = (uint)in[i+3];
11624 (out+j+3)[0] = (uint)in[i+0];
11625 (out+j+3)[1] = (uint)in[i+1];
11626 (out+j+3)[2] = (uint)in[i+3];
11638 uint *out = (uint*)_out;
11644 (out+j+0)[0] = restart_index;
11645 (out+j+0)[1] = restart_index;
11646 (out+j+0)[2] = restart_index;
11664 (out+j)[0] = (uint)in[i+1];
11665 (out+j)[1] = (uint)in[i+2];
11666 (out+j)[2] = (uint)in[start];
11678 uint *out = (uint*)_out;
11682 (out+j)[0] = (uint)in[i+0];
11683 (out+j)[1] = (uint)in[i+1];
11684 (out+j)[2] = (uint)in[i+2];
11685 (out+j)[3] = (uint)in[i+3];
11697 uint *out = (uint*)_out;
11701 (out+j)[0] = (uint)in[i+0];
11702 (out+j)[1] = (uint)in[i+1];
11703 (out+j)[2] = (uint)in[i+2];
11704 (out+j)[3] = (uint)in[i+3];
11716 uint *out = (uint*)_out;
11720 (out+j)[0] = (uint)in[i+0];
11721 (out+j)[1] = (uint)in[i+1];
11722 (out+j)[2] = (uint)in[i+2];
11723 (out+j)[3] = (uint)in[i+3];
11724 (out+j)[4] = (uint)in[i+4];
11725 (out+j)[5] = (uint)in[i+5];
11737 uint *out = (uint*)_out;
11743 (out+j)[0] = (uint)in[i+0];
11744 (out+j)[1] = (uint)in[i+1];
11745 (out+j)[2] = (uint)in[i+2];
11746 (out+j)[3] = (uint)in[i+3];
11747 (out+j)[4] = (uint)in[i+4];
11748 (out+j)[5] = (uint)in[i+5];
11751 (out+j)[0] = (uint)in[i+2];
11752 (out+j)[1] = (uint)in[i-2];
11753 (out+j)[2] = (uint)in[i+0];
11754 (out+j)[3] = (uint)in[i+3];
11755 (out+j)[4] = (uint)in[i+4];
11756 (out+j)[5] = (uint)in[i+6];
11769 ushort *out = (ushort*)_out;
11773 (out+j)[0] = (ushort)in[i];
11785 ushort *out = (ushort*)_out;
11789 (out+j)[0] = (ushort)in[i];
11790 (out+j)[1] = (ushort)in[i+1];
11802 ushort *out = (ushort*)_out;
11806 (out+j)[0] = (ushort)in[i];
11807 (out+j)[1] = (ushort)in[i+1];
11819 ushort *out = (ushort*)_out;
11823 (out+j)[0] = (ushort)in[i];
11824 (out+j)[1] = (ushort)in[i+1];
11826 (out+j)[0] = (ushort)in[i];
11827 (out+j)[1] = (ushort)in[start];
11838 ushort *out = (ushort*)_out;
11842 (out+j)[0] = (ushort)in[i];
11843 (out+j)[1] = (ushort)in[i+1];
11844 (out+j)[2] = (ushort)in[i+2];
11856 ushort *out = (ushort*)_out;
11860 (out+j)[0] = (ushort)in[i];
11861 (out+j)[1] = (ushort)in[i+1+(i&1)];
11862 (out+j)[2] = (ushort)in[i+2-(i&1)];
11874 ushort *out = (ushort*)_out;
11878 (out+j)[0] = (ushort)in[start];
11879 (out+j)[1] = (ushort)in[i+1];
11880 (out+j)[2] = (ushort)in[i+2];
11892 ushort *out = (ushort*)_out;
11896 (out+j+0)[0] = (ushort)in[i+0];
11897 (out+j+0)[1] = (ushort)in[i+1];
11898 (out+j+0)[2] = (ushort)in[i+2];
11899 (out+j+3)[0] = (ushort)in[i+0];
11900 (out+j+3)[1] = (ushort)in[i+2];
11901 (out+j+3)[2] = (ushort)in[i+3];
11913 ushort *out = (ushort*)_out;
11917 (out+j+0)[0] = (ushort)in[i+0];
11918 (out+j+0)[1] = (ushort)in[i+1];
11919 (out+j+0)[2] = (ushort)in[i+3];
11920 (out+j+3)[0] = (ushort)in[i+0];
11921 (out+j+3)[1] = (ushort)in[i+3];
11922 (out+j+3)[2] = (ushort)in[i+2];
11934 ushort *out = (ushort*)_out;
11938 (out+j)[0] = (ushort)in[start];
11939 (out+j)[1] = (ushort)in[i+1];
11940 (out+j)[2] = (ushort)in[i+2];
11952 ushort *out = (ushort*)_out;
11956 (out+j)[0] = (ushort)in[i+0];
11957 (out+j)[1] = (ushort)in[i+1];
11958 (out+j)[2] = (ushort)in[i+2];
11959 (out+j)[3] = (ushort)in[i+3];
11971 ushort *out = (ushort*)_out;
11975 (out+j)[0] = (ushort)in[i+0];
11976 (out+j)[1] = (ushort)in[i+1];
11977 (out+j)[2] = (ushort)in[i+2];
11978 (out+j)[3] = (ushort)in[i+3];
11990 ushort *out = (ushort*)_out;
11994 (out+j)[0] = (ushort)in[i+0];
11995 (out+j)[1] = (ushort)in[i+1];
11996 (out+j)[2] = (ushort)in[i+2];
11997 (out+j)[3] = (ushort)in[i+3];
11998 (out+j)[4] = (ushort)in[i+4];
11999 (out+j)[5] = (ushort)in[i+5];
12011 ushort *out = (ushort*)_out;
12017 (out+j)[0] = (ushort)in[i+0];
12018 (out+j)[1] = (ushort)in[i+1];
12019 (out+j)[2] = (ushort)in[i+2];
12020 (out+j)[3] = (ushort)in[i+3];
12021 (out+j)[4] = (ushort)in[i+4];
12022 (out+j)[5] = (ushort)in[i+5];
12025 (out+j)[0] = (ushort)in[i+2];
12026 (out+j)[1] = (ushort)in[i-2];
12027 (out+j)[2] = (ushort)in[i+0];
12028 (out+j)[3] = (ushort)in[i+3];
12029 (out+j)[4] = (ushort)in[i+4];
12030 (out+j)[5] = (ushort)in[i+6];
12043 ushort *out = (ushort*)_out;
12047 (out+j)[0] = (ushort)in[i];
12059 ushort *out = (ushort*)_out;
12063 (out+j)[0] = (ushort)in[i];
12064 (out+j)[1] = (ushort)in[i+1];
12076 ushort *out = (ushort*)_out;
12080 (out+j)[0] = (ushort)in[i];
12081 (out+j)[1] = (ushort)in[i+1];
12093 ushort *out = (ushort*)_out;
12097 (out+j)[0] = (ushort)in[i];
12098 (out+j)[1] = (ushort)in[i+1];
12100 (out+j)[0] = (ushort)in[i];
12101 (out+j)[1] = (ushort)in[start];
12112 ushort *out = (ushort*)_out;
12116 (out+j)[0] = (ushort)in[i];
12117 (out+j)[1] = (ushort)in[i+1];
12118 (out+j)[2] = (ushort)in[i+2];
12130 ushort *out = (ushort*)_out;
12134 (out+j)[0] = (ushort)in[i];
12135 (out+j)[1] = (ushort)in[i+1+(i&1)];
12136 (out+j)[2] = (ushort)in[i+2-(i&1)];
12148 ushort *out = (ushort*)_out;
12152 (out+j)[0] = (ushort)in[start];
12153 (out+j)[1] = (ushort)in[i+1];
12154 (out+j)[2] = (ushort)in[i+2];
12166 ushort *out = (ushort*)_out;
12172 (out+j+0)[0] = restart_index;
12173 (out+j+0)[1] = restart_index;
12174 (out+j+0)[2] = restart_index;
12175 (out+j+3)[0] = restart_index;
12176 (out+j+3)[1] = restart_index;
12177 (out+j+3)[2] = restart_index;
12196 (out+j+0)[0] = (ushort)in[i+0];
12197 (out+j+0)[1] = (ushort)in[i+1];
12198 (out+j+0)[2] = (ushort)in[i+2];
12199 (out+j+3)[0] = (ushort)in[i+0];
12200 (out+j+3)[1] = (ushort)in[i+2];
12201 (out+j+3)[2] = (ushort)in[i+3];
12213 ushort *out = (ushort*)_out;
12219 (out+j+0)[0] = restart_index;
12220 (out+j+0)[1] = restart_index;
12221 (out+j+0)[2] = restart_index;
12222 (out+j+3)[0] = restart_index;
12223 (out+j+3)[1] = restart_index;
12224 (out+j+3)[2] = restart_index;
12243 (out+j+0)[0] = (ushort)in[i+0];
12244 (out+j+0)[1] = (ushort)in[i+1];
12245 (out+j+0)[2] = (ushort)in[i+3];
12246 (out+j+3)[0] = (ushort)in[i+0];
12247 (out+j+3)[1] = (ushort)in[i+3];
12248 (out+j+3)[2] = (ushort)in[i+2];
12260 ushort *out = (ushort*)_out;
12266 (out+j+0)[0] = restart_index;
12267 (out+j+0)[1] = restart_index;
12268 (out+j+0)[2] = restart_index;
12286 (out+j)[0] = (ushort)in[start];
12287 (out+j)[1] = (ushort)in[i+1];
12288 (out+j)[2] = (ushort)in[i+2];
12300 ushort *out = (ushort*)_out;
12304 (out+j)[0] = (ushort)in[i+0];
12305 (out+j)[1] = (ushort)in[i+1];
12306 (out+j)[2] = (ushort)in[i+2];
12307 (out+j)[3] = (ushort)in[i+3];
12319 ushort *out = (ushort*)_out;
12323 (out+j)[0] = (ushort)in[i+0];
12324 (out+j)[1] = (ushort)in[i+1];
12325 (out+j)[2] = (ushort)in[i+2];
12326 (out+j)[3] = (ushort)in[i+3];
12338 ushort *out = (ushort*)_out;
12342 (out+j)[0] = (ushort)in[i+0];
12343 (out+j)[1] = (ushort)in[i+1];
12344 (out+j)[2] = (ushort)in[i+2];
12345 (out+j)[3] = (ushort)in[i+3];
12346 (out+j)[4] = (ushort)in[i+4];
12347 (out+j)[5] = (ushort)in[i+5];
12359 ushort *out = (ushort*)_out;
12365 (out+j)[0] = (ushort)in[i+0];
12366 (out+j)[1] = (ushort)in[i+1];
12367 (out+j)[2] = (ushort)in[i+2];
12368 (out+j)[3] = (ushort)in[i+3];
12369 (out+j)[4] = (ushort)in[i+4];
12370 (out+j)[5] = (ushort)in[i+5];
12373 (out+j)[0] = (ushort)in[i+2];
12374 (out+j)[1] = (ushort)in[i-2];
12375 (out+j)[2] = (ushort)in[i+0];
12376 (out+j)[3] = (ushort)in[i+3];
12377 (out+j)[4] = (ushort)in[i+4];
12378 (out+j)[5] = (ushort)in[i+6];
12391 ushort *out = (ushort*)_out;
12395 (out+j)[0] = (ushort)in[i];
12407 ushort *out = (ushort*)_out;
12411 (out+j)[0] = (ushort)in[i+1];
12412 (out+j)[1] = (ushort)in[i];
12424 ushort *out = (ushort*)_out;
12428 (out+j)[0] = (ushort)in[i+1];
12429 (out+j)[1] = (ushort)in[i];
12441 ushort *out = (ushort*)_out;
12445 (out+j)[0] = (ushort)in[i+1];
12446 (out+j)[1] = (ushort)in[i];
12448 (out+j)[0] = (ushort)in[start];
12449 (out+j)[1] = (ushort)in[i];
12460 ushort *out = (ushort*)_out;
12464 (out+j)[0] = (ushort)in[i+1];
12465 (out+j)[1] = (ushort)in[i+2];
12466 (out+j)[2] = (ushort)in[i];
12478 ushort *out = (ushort*)_out;
12482 (out+j)[0] = (ushort)in[i+1+(i&1)];
12483 (out+j)[1] = (ushort)in[i+2-(i&1)];
12484 (out+j)[2] = (ushort)in[i];
12496 ushort *out = (ushort*)_out;
12500 (out+j)[0] = (ushort)in[i+1];
12501 (out+j)[1] = (ushort)in[i+2];
12502 (out+j)[2] = (ushort)in[start];
12514 ushort *out = (ushort*)_out;
12518 (out+j+0)[0] = (ushort)in[i+1];
12519 (out+j+0)[1] = (ushort)in[i+2];
12520 (out+j+0)[2] = (ushort)in[i+0];
12521 (out+j+3)[0] = (ushort)in[i+2];
12522 (out+j+3)[1] = (ushort)in[i+3];
12523 (out+j+3)[2] = (ushort)in[i+0];
12535 ushort *out = (ushort*)_out;
12539 (out+j+0)[0] = (ushort)in[i+1];
12540 (out+j+0)[1] = (ushort)in[i+3];
12541 (out+j+0)[2] = (ushort)in[i+0];
12542 (out+j+3)[0] = (ushort)in[i+3];
12543 (out+j+3)[1] = (ushort)in[i+2];
12544 (out+j+3)[2] = (ushort)in[i+0];
12556 ushort *out = (ushort*)_out;
12560 (out+j)[0] = (ushort)in[i+1];
12561 (out+j)[1] = (ushort)in[i+2];
12562 (out+j)[2] = (ushort)in[start];
12574 ushort *out = (ushort*)_out;
12578 (out+j)[0] = (ushort)in[i+3];
12579 (out+j)[1] = (ushort)in[i+2];
12580 (out+j)[2] = (ushort)in[i+1];
12581 (out+j)[3] = (ushort)in[i+0];
12593 ushort *out = (ushort*)_out;
12597 (out+j)[0] = (ushort)in[i+3];
12598 (out+j)[1] = (ushort)in[i+2];
12599 (out+j)[2] = (ushort)in[i+1];
12600 (out+j)[3] = (ushort)in[i+0];
12612 ushort *out = (ushort*)_out;
12616 (out+j)[0] = (ushort)in[i+4];
12617 (out+j)[1] = (ushort)in[i+5];
12618 (out+j)[2] = (ushort)in[i+0];
12619 (out+j)[3] = (ushort)in[i+1];
12620 (out+j)[4] = (ushort)in[i+2];
12621 (out+j)[5] = (ushort)in[i+3];
12633 ushort *out = (ushort*)_out;
12639 (out+j)[0] = (ushort)in[i+4];
12640 (out+j)[1] = (ushort)in[i+5];
12641 (out+j)[2] = (ushort)in[i+0];
12642 (out+j)[3] = (ushort)in[i+1];
12643 (out+j)[4] = (ushort)in[i+2];
12644 (out+j)[5] = (ushort)in[i+3];
12647 (out+j)[0] = (ushort)in[i+4];
12648 (out+j)[1] = (ushort)in[i+6];
12649 (out+j)[2] = (ushort)in[i+2];
12650 (out+j)[3] = (ushort)in[i-2];
12651 (out+j)[4] = (ushort)in[i+0];
12652 (out+j)[5] = (ushort)in[i+3];
12665 ushort *out = (ushort*)_out;
12669 (out+j)[0] = (ushort)in[i];
12681 ushort *out = (ushort*)_out;
12685 (out+j)[0] = (ushort)in[i+1];
12686 (out+j)[1] = (ushort)in[i];
12698 ushort *out = (ushort*)_out;
12702 (out+j)[0] = (ushort)in[i+1];
12703 (out+j)[1] = (ushort)in[i];
12715 ushort *out = (ushort*)_out;
12719 (out+j)[0] = (ushort)in[i+1];
12720 (out+j)[1] = (ushort)in[i];
12722 (out+j)[0] = (ushort)in[start];
12723 (out+j)[1] = (ushort)in[i];
12734 ushort *out = (ushort*)_out;
12738 (out+j)[0] = (ushort)in[i+1];
12739 (out+j)[1] = (ushort)in[i+2];
12740 (out+j)[2] = (ushort)in[i];
12752 ushort *out = (ushort*)_out;
12756 (out+j)[0] = (ushort)in[i+1+(i&1)];
12757 (out+j)[1] = (ushort)in[i+2-(i&1)];
12758 (out+j)[2] = (ushort)in[i];
12770 ushort *out = (ushort*)_out;
12774 (out+j)[0] = (ushort)in[i+1];
12775 (out+j)[1] = (ushort)in[i+2];
12776 (out+j)[2] = (ushort)in[start];
12788 ushort *out = (ushort*)_out;
12794 (out+j+0)[0] = restart_index;
12795 (out+j+0)[1] = restart_index;
12796 (out+j+0)[2] = restart_index;
12797 (out+j+3)[0] = restart_index;
12798 (out+j+3)[1] = restart_index;
12799 (out+j+3)[2] = restart_index;
12818 (out+j+0)[0] = (ushort)in[i+1];
12819 (out+j+0)[1] = (ushort)in[i+2];
12820 (out+j+0)[2] = (ushort)in[i+0];
12821 (out+j+3)[0] = (ushort)in[i+2];
12822 (out+j+3)[1] = (ushort)in[i+3];
12823 (out+j+3)[2] = (ushort)in[i+0];
12835 ushort *out = (ushort*)_out;
12841 (out+j+0)[0] = restart_index;
12842 (out+j+0)[1] = restart_index;
12843 (out+j+0)[2] = restart_index;
12844 (out+j+3)[0] = restart_index;
12845 (out+j+3)[1] = restart_index;
12846 (out+j+3)[2] = restart_index;
12865 (out+j+0)[0] = (ushort)in[i+1];
12866 (out+j+0)[1] = (ushort)in[i+3];
12867 (out+j+0)[2] = (ushort)in[i+0];
12868 (out+j+3)[0] = (ushort)in[i+3];
12869 (out+j+3)[1] = (ushort)in[i+2];
12870 (out+j+3)[2] = (ushort)in[i+0];
12882 ushort *out = (ushort*)_out;
12888 (out+j+0)[0] = restart_index;
12889 (out+j+0)[1] = restart_index;
12890 (out+j+0)[2] = restart_index;
12908 (out+j)[0] = (ushort)in[i+1];
12909 (out+j)[1] = (ushort)in[i+2];
12910 (out+j)[2] = (ushort)in[start];
12922 ushort *out = (ushort*)_out;
12926 (out+j)[0] = (ushort)in[i+3];
12927 (out+j)[1] = (ushort)in[i+2];
12928 (out+j)[2] = (ushort)in[i+1];
12929 (out+j)[3] = (ushort)in[i+0];
12941 ushort *out = (ushort*)_out;
12945 (out+j)[0] = (ushort)in[i+3];
12946 (out+j)[1] = (ushort)in[i+2];
12947 (out+j)[2] = (ushort)in[i+1];
12948 (out+j)[3] = (ushort)in[i+0];
12960 ushort *out = (ushort*)_out;
12964 (out+j)[0] = (ushort)in[i+4];
12965 (out+j)[1] = (ushort)in[i+5];
12966 (out+j)[2] = (ushort)in[i+0];
12967 (out+j)[3] = (ushort)in[i+1];
12968 (out+j)[4] = (ushort)in[i+2];
12969 (out+j)[5] = (ushort)in[i+3];
12981 ushort *out = (ushort*)_out;
12987 (out+j)[0] = (ushort)in[i+4];
12988 (out+j)[1] = (ushort)in[i+5];
12989 (out+j)[2] = (ushort)in[i+0];
12990 (out+j)[3] = (ushort)in[i+1];
12991 (out+j)[4] = (ushort)in[i+2];
12992 (out+j)[5] = (ushort)in[i+3];
12995 (out+j)[0] = (ushort)in[i+4];
12996 (out+j)[1] = (ushort)in[i+6];
12997 (out+j)[2] = (ushort)in[i+2];
12998 (out+j)[3] = (ushort)in[i-2];
12999 (out+j)[4] = (ushort)in[i+0];
13000 (out+j)[5] = (ushort)in[i+3];
13013 ushort *out = (ushort*)_out;
13017 (out+j)[0] = (ushort)in[i];
13029 ushort *out = (ushort*)_out;
13033 (out+j)[0] = (ushort)in[i+1];
13034 (out+j)[1] = (ushort)in[i];
13046 ushort *out = (ushort*)_out;
13050 (out+j)[0] = (ushort)in[i+1];
13051 (out+j)[1] = (ushort)in[i];
13063 ushort *out = (ushort*)_out;
13067 (out+j)[0] = (ushort)in[i+1];
13068 (out+j)[1] = (ushort)in[i];
13070 (out+j)[0] = (ushort)in[start];
13071 (out+j)[1] = (ushort)in[i];
13082 ushort *out = (ushort*)_out;
13086 (out+j)[0] = (ushort)in[i+2];
13087 (out+j)[1] = (ushort)in[i];
13088 (out+j)[2] = (ushort)in[i+1];
13100 ushort *out = (ushort*)_out;
13104 (out+j)[0] = (ushort)in[i+2];
13105 (out+j)[1] = (ushort)in[i+(i&1)];
13106 (out+j)[2] = (ushort)in[i+1-(i&1)];
13118 ushort *out = (ushort*)_out;
13122 (out+j)[0] = (ushort)in[i+2];
13123 (out+j)[1] = (ushort)in[start];
13124 (out+j)[2] = (ushort)in[i+1];
13136 ushort *out = (ushort*)_out;
13140 (out+j+0)[0] = (ushort)in[i+3];
13141 (out+j+0)[1] = (ushort)in[i+0];
13142 (out+j+0)[2] = (ushort)in[i+1];
13143 (out+j+3)[0] = (ushort)in[i+3];
13144 (out+j+3)[1] = (ushort)in[i+1];
13145 (out+j+3)[2] = (ushort)in[i+2];
13157 ushort *out = (ushort*)_out;
13161 (out+j+0)[0] = (ushort)in[i+3];
13162 (out+j+0)[1] = (ushort)in[i+2];
13163 (out+j+0)[2] = (ushort)in[i+0];
13164 (out+j+3)[0] = (ushort)in[i+3];
13165 (out+j+3)[1] = (ushort)in[i+0];
13166 (out+j+3)[2] = (ushort)in[i+1];
13178 ushort *out = (ushort*)_out;
13182 (out+j)[0] = (ushort)in[start];
13183 (out+j)[1] = (ushort)in[i+1];
13184 (out+j)[2] = (ushort)in[i+2];
13196 ushort *out = (ushort*)_out;
13200 (out+j)[0] = (ushort)in[i+3];
13201 (out+j)[1] = (ushort)in[i+2];
13202 (out+j)[2] = (ushort)in[i+1];
13203 (out+j)[3] = (ushort)in[i+0];
13215 ushort *out = (ushort*)_out;
13219 (out+j)[0] = (ushort)in[i+3];
13220 (out+j)[1] = (ushort)in[i+2];
13221 (out+j)[2] = (ushort)in[i+1];
13222 (out+j)[3] = (ushort)in[i+0];
13234 ushort *out = (ushort*)_out;
13238 (out+j)[0] = (ushort)in[i+4];
13239 (out+j)[1] = (ushort)in[i+5];
13240 (out+j)[2] = (ushort)in[i+0];
13241 (out+j)[3] = (ushort)in[i+1];
13242 (out+j)[4] = (ushort)in[i+2];
13243 (out+j)[5] = (ushort)in[i+3];
13255 ushort *out = (ushort*)_out;
13261 (out+j)[0] = (ushort)in[i+4];
13262 (out+j)[1] = (ushort)in[i+5];
13263 (out+j)[2] = (ushort)in[i+0];
13264 (out+j)[3] = (ushort)in[i+1];
13265 (out+j)[4] = (ushort)in[i+2];
13266 (out+j)[5] = (ushort)in[i+3];
13269 (out+j)[0] = (ushort)in[i+4];
13270 (out+j)[1] = (ushort)in[i+6];
13271 (out+j)[2] = (ushort)in[i+2];
13272 (out+j)[3] = (ushort)in[i-2];
13273 (out+j)[4] = (ushort)in[i+0];
13274 (out+j)[5] = (ushort)in[i+3];
13287 ushort *out = (ushort*)_out;
13291 (out+j)[0] = (ushort)in[i];
13303 ushort *out = (ushort*)_out;
13307 (out+j)[0] = (ushort)in[i+1];
13308 (out+j)[1] = (ushort)in[i];
13320 ushort *out = (ushort*)_out;
13324 (out+j)[0] = (ushort)in[i+1];
13325 (out+j)[1] = (ushort)in[i];
13337 ushort *out = (ushort*)_out;
13341 (out+j)[0] = (ushort)in[i+1];
13342 (out+j)[1] = (ushort)in[i];
13344 (out+j)[0] = (ushort)in[start];
13345 (out+j)[1] = (ushort)in[i];
13356 ushort *out = (ushort*)_out;
13360 (out+j)[0] = (ushort)in[i+2];
13361 (out+j)[1] = (ushort)in[i];
13362 (out+j)[2] = (ushort)in[i+1];
13374 ushort *out = (ushort*)_out;
13378 (out+j)[0] = (ushort)in[i+2];
13379 (out+j)[1] = (ushort)in[i+(i&1)];
13380 (out+j)[2] = (ushort)in[i+1-(i&1)];
13392 ushort *out = (ushort*)_out;
13396 (out+j)[0] = (ushort)in[i+2];
13397 (out+j)[1] = (ushort)in[start];
13398 (out+j)[2] = (ushort)in[i+1];
13410 ushort *out = (ushort*)_out;
13416 (out+j+0)[0] = restart_index;
13417 (out+j+0)[1] = restart_index;
13418 (out+j+0)[2] = restart_index;
13419 (out+j+3)[0] = restart_index;
13420 (out+j+3)[1] = restart_index;
13421 (out+j+3)[2] = restart_index;
13440 (out+j+0)[0] = (ushort)in[i+3];
13441 (out+j+0)[1] = (ushort)in[i+0];
13442 (out+j+0)[2] = (ushort)in[i+1];
13443 (out+j+3)[0] = (ushort)in[i+3];
13444 (out+j+3)[1] = (ushort)in[i+1];
13445 (out+j+3)[2] = (ushort)in[i+2];
13457 ushort *out = (ushort*)_out;
13463 (out+j+0)[0] = restart_index;
13464 (out+j+0)[1] = restart_index;
13465 (out+j+0)[2] = restart_index;
13466 (out+j+3)[0] = restart_index;
13467 (out+j+3)[1] = restart_index;
13468 (out+j+3)[2] = restart_index;
13487 (out+j+0)[0] = (ushort)in[i+3];
13488 (out+j+0)[1] = (ushort)in[i+2];
13489 (out+j+0)[2] = (ushort)in[i+0];
13490 (out+j+3)[0] = (ushort)in[i+3];
13491 (out+j+3)[1] = (ushort)in[i+0];
13492 (out+j+3)[2] = (ushort)in[i+1];
13504 ushort *out = (ushort*)_out;
13510 (out+j+0)[0] = restart_index;
13511 (out+j+0)[1] = restart_index;
13512 (out+j+0)[2] = restart_index;
13530 (out+j)[0] = (ushort)in[start];
13531 (out+j)[1] = (ushort)in[i+1];
13532 (out+j)[2] = (ushort)in[i+2];
13544 ushort *out = (ushort*)_out;
13548 (out+j)[0] = (ushort)in[i+3];
13549 (out+j)[1] = (ushort)in[i+2];
13550 (out+j)[2] = (ushort)in[i+1];
13551 (out+j)[3] = (ushort)in[i+0];
13563 ushort *out = (ushort*)_out;
13567 (out+j)[0] = (ushort)in[i+3];
13568 (out+j)[1] = (ushort)in[i+2];
13569 (out+j)[2] = (ushort)in[i+1];
13570 (out+j)[3] = (ushort)in[i+0];
13582 ushort *out = (ushort*)_out;
13586 (out+j)[0] = (ushort)in[i+4];
13587 (out+j)[1] = (ushort)in[i+5];
13588 (out+j)[2] = (ushort)in[i+0];
13589 (out+j)[3] = (ushort)in[i+1];
13590 (out+j)[4] = (ushort)in[i+2];
13591 (out+j)[5] = (ushort)in[i+3];
13603 ushort *out = (ushort*)_out;
13609 (out+j)[0] = (ushort)in[i+4];
13610 (out+j)[1] = (ushort)in[i+5];
13611 (out+j)[2] = (ushort)in[i+0];
13612 (out+j)[3] = (ushort)in[i+1];
13613 (out+j)[4] = (ushort)in[i+2];
13614 (out+j)[5] = (ushort)in[i+3];
13617 (out+j)[0] = (ushort)in[i+4];
13618 (out+j)[1] = (ushort)in[i+6];
13619 (out+j)[2] = (ushort)in[i+2];
13620 (out+j)[3] = (ushort)in[i-2];
13621 (out+j)[4] = (ushort)in[i+0];
13622 (out+j)[5] = (ushort)in[i+3];
13635 ushort *out = (ushort*)_out;
13639 (out+j)[0] = (ushort)in[i];
13651 ushort *out = (ushort*)_out;
13655 (out+j)[0] = (ushort)in[i];
13656 (out+j)[1] = (ushort)in[i+1];
13668 ushort *out = (ushort*)_out;
13672 (out+j)[0] = (ushort)in[i];
13673 (out+j)[1] = (ushort)in[i+1];
13685 ushort *out = (ushort*)_out;
13689 (out+j)[0] = (ushort)in[i];
13690 (out+j)[1] = (ushort)in[i+1];
13692 (out+j)[0] = (ushort)in[i];
13693 (out+j)[1] = (ushort)in[start];
13704 ushort *out = (ushort*)_out;
13708 (out+j)[0] = (ushort)in[i];
13709 (out+j)[1] = (ushort)in[i+1];
13710 (out+j)[2] = (ushort)in[i+2];
13722 ushort *out = (ushort*)_out;
13726 (out+j)[0] = (ushort)in[i+(i&1)];
13727 (out+j)[1] = (ushort)in[i+1-(i&1)];
13728 (out+j)[2] = (ushort)in[i+2];
13740 ushort *out = (ushort*)_out;
13744 (out+j)[0] = (ushort)in[start];
13745 (out+j)[1] = (ushort)in[i+1];
13746 (out+j)[2] = (ushort)in[i+2];
13758 ushort *out = (ushort*)_out;
13762 (out+j+0)[0] = (ushort)in[i+0];
13763 (out+j+0)[1] = (ushort)in[i+1];
13764 (out+j+0)[2] = (ushort)in[i+3];
13765 (out+j+3)[0] = (ushort)in[i+1];
13766 (out+j+3)[1] = (ushort)in[i+2];
13767 (out+j+3)[2] = (ushort)in[i+3];
13779 ushort *out = (ushort*)_out;
13783 (out+j+0)[0] = (ushort)in[i+2];
13784 (out+j+0)[1] = (ushort)in[i+0];
13785 (out+j+0)[2] = (ushort)in[i+3];
13786 (out+j+3)[0] = (ushort)in[i+0];
13787 (out+j+3)[1] = (ushort)in[i+1];
13788 (out+j+3)[2] = (ushort)in[i+3];
13800 ushort *out = (ushort*)_out;
13804 (out+j)[0] = (ushort)in[i+1];
13805 (out+j)[1] = (ushort)in[i+2];
13806 (out+j)[2] = (ushort)in[start];
13818 ushort *out = (ushort*)_out;
13822 (out+j)[0] = (ushort)in[i+0];
13823 (out+j)[1] = (ushort)in[i+1];
13824 (out+j)[2] = (ushort)in[i+2];
13825 (out+j)[3] = (ushort)in[i+3];
13837 ushort *out = (ushort*)_out;
13841 (out+j)[0] = (ushort)in[i+0];
13842 (out+j)[1] = (ushort)in[i+1];
13843 (out+j)[2] = (ushort)in[i+2];
13844 (out+j)[3] = (ushort)in[i+3];
13856 ushort *out = (ushort*)_out;
13860 (out+j)[0] = (ushort)in[i+0];
13861 (out+j)[1] = (ushort)in[i+1];
13862 (out+j)[2] = (ushort)in[i+2];
13863 (out+j)[3] = (ushort)in[i+3];
13864 (out+j)[4] = (ushort)in[i+4];
13865 (out+j)[5] = (ushort)in[i+5];
13877 ushort *out = (ushort*)_out;
13883 (out+j)[0] = (ushort)in[i+0];
13884 (out+j)[1] = (ushort)in[i+1];
13885 (out+j)[2] = (ushort)in[i+2];
13886 (out+j)[3] = (ushort)in[i+3];
13887 (out+j)[4] = (ushort)in[i+4];
13888 (out+j)[5] = (ushort)in[i+5];
13891 (out+j)[0] = (ushort)in[i+2];
13892 (out+j)[1] = (ushort)in[i-2];
13893 (out+j)[2] = (ushort)in[i+0];
13894 (out+j)[3] = (ushort)in[i+3];
13895 (out+j)[4] = (ushort)in[i+4];
13896 (out+j)[5] = (ushort)in[i+6];
13909 ushort *out = (ushort*)_out;
13913 (out+j)[0] = (ushort)in[i];
13925 ushort *out = (ushort*)_out;
13929 (out+j)[0] = (ushort)in[i];
13930 (out+j)[1] = (ushort)in[i+1];
13942 ushort *out = (ushort*)_out;
13946 (out+j)[0] = (ushort)in[i];
13947 (out+j)[1] = (ushort)in[i+1];
13959 ushort *out = (ushort*)_out;
13963 (out+j)[0] = (ushort)in[i];
13964 (out+j)[1] = (ushort)in[i+1];
13966 (out+j)[0] = (ushort)in[i];
13967 (out+j)[1] = (ushort)in[start];
13978 ushort *out = (ushort*)_out;
13982 (out+j)[0] = (ushort)in[i];
13983 (out+j)[1] = (ushort)in[i+1];
13984 (out+j)[2] = (ushort)in[i+2];
13996 ushort *out = (ushort*)_out;
14000 (out+j)[0] = (ushort)in[i+(i&1)];
14001 (out+j)[1] = (ushort)in[i+1-(i&1)];
14002 (out+j)[2] = (ushort)in[i+2];
14014 ushort *out = (ushort*)_out;
14018 (out+j)[0] = (ushort)in[start];
14019 (out+j)[1] = (ushort)in[i+1];
14020 (out+j)[2] = (ushort)in[i+2];
14032 ushort *out = (ushort*)_out;
14038 (out+j+0)[0] = restart_index;
14039 (out+j+0)[1] = restart_index;
14040 (out+j+0)[2] = restart_index;
14041 (out+j+3)[0] = restart_index;
14042 (out+j+3)[1] = restart_index;
14043 (out+j+3)[2] = restart_index;
14062 (out+j+0)[0] = (ushort)in[i+0];
14063 (out+j+0)[1] = (ushort)in[i+1];
14064 (out+j+0)[2] = (ushort)in[i+3];
14065 (out+j+3)[0] = (ushort)in[i+1];
14066 (out+j+3)[1] = (ushort)in[i+2];
14067 (out+j+3)[2] = (ushort)in[i+3];
14079 ushort *out = (ushort*)_out;
14085 (out+j+0)[0] = restart_index;
14086 (out+j+0)[1] = restart_index;
14087 (out+j+0)[2] = restart_index;
14088 (out+j+3)[0] = restart_index;
14089 (out+j+3)[1] = restart_index;
14090 (out+j+3)[2] = restart_index;
14109 (out+j+0)[0] = (ushort)in[i+2];
14110 (out+j+0)[1] = (ushort)in[i+0];
14111 (out+j+0)[2] = (ushort)in[i+3];
14112 (out+j+3)[0] = (ushort)in[i+0];
14113 (out+j+3)[1] = (ushort)in[i+1];
14114 (out+j+3)[2] = (ushort)in[i+3];
14126 ushort *out = (ushort*)_out;
14132 (out+j+0)[0] = restart_index;
14133 (out+j+0)[1] = restart_index;
14134 (out+j+0)[2] = restart_index;
14152 (out+j)[0] = (ushort)in[i+1];
14153 (out+j)[1] = (ushort)in[i+2];
14154 (out+j)[2] = (ushort)in[start];
14166 ushort *out = (ushort*)_out;
14170 (out+j)[0] = (ushort)in[i+0];
14171 (out+j)[1] = (ushort)in[i+1];
14172 (out+j)[2] = (ushort)in[i+2];
14173 (out+j)[3] = (ushort)in[i+3];
14185 ushort *out = (ushort*)_out;
14189 (out+j)[0] = (ushort)in[i+0];
14190 (out+j)[1] = (ushort)in[i+1];
14191 (out+j)[2] = (ushort)in[i+2];
14192 (out+j)[3] = (ushort)in[i+3];
14204 ushort *out = (ushort*)_out;
14208 (out+j)[0] = (ushort)in[i+0];
14209 (out+j)[1] = (ushort)in[i+1];
14210 (out+j)[2] = (ushort)in[i+2];
14211 (out+j)[3] = (ushort)in[i+3];
14212 (out+j)[4] = (ushort)in[i+4];
14213 (out+j)[5] = (ushort)in[i+5];
14225 ushort *out = (ushort*)_out;
14231 (out+j)[0] = (ushort)in[i+0];
14232 (out+j)[1] = (ushort)in[i+1];
14233 (out+j)[2] = (ushort)in[i+2];
14234 (out+j)[3] = (ushort)in[i+3];
14235 (out+j)[4] = (ushort)in[i+4];
14236 (out+j)[5] = (ushort)in[i+5];
14239 (out+j)[0] = (ushort)in[i+2];
14240 (out+j)[1] = (ushort)in[i-2];
14241 (out+j)[2] = (ushort)in[i+0];
14242 (out+j)[3] = (ushort)in[i+3];
14243 (out+j)[4] = (ushort)in[i+4];
14244 (out+j)[5] = (ushort)in[i+6];
14257 uint *out = (uint*)_out;
14261 (out+j)[0] = (uint)in[i];
14273 uint *out = (uint*)_out;
14277 (out+j)[0] = (uint)in[i];
14278 (out+j)[1] = (uint)in[i+1];
14290 uint *out = (uint*)_out;
14294 (out+j)[0] = (uint)in[i];
14295 (out+j)[1] = (uint)in[i+1];
14307 uint *out = (uint*)_out;
14311 (out+j)[0] = (uint)in[i];
14312 (out+j)[1] = (uint)in[i+1];
14314 (out+j)[0] = (uint)in[i];
14315 (out+j)[1] = (uint)in[start];
14326 uint *out = (uint*)_out;
14330 (out+j)[0] = (uint)in[i];
14331 (out+j)[1] = (uint)in[i+1];
14332 (out+j)[2] = (uint)in[i+2];
14344 uint *out = (uint*)_out;
14348 (out+j)[0] = (uint)in[i];
14349 (out+j)[1] = (uint)in[i+1+(i&1)];
14350 (out+j)[2] = (uint)in[i+2-(i&1)];
14362 uint *out = (uint*)_out;
14366 (out+j)[0] = (uint)in[start];
14367 (out+j)[1] = (uint)in[i+1];
14368 (out+j)[2] = (uint)in[i+2];
14380 uint *out = (uint*)_out;
14384 (out+j+0)[0] = (uint)in[i+0];
14385 (out+j+0)[1] = (uint)in[i+1];
14386 (out+j+0)[2] = (uint)in[i+2];
14387 (out+j+3)[0] = (uint)in[i+0];
14388 (out+j+3)[1] = (uint)in[i+2];
14389 (out+j+3)[2] = (uint)in[i+3];
14401 uint *out = (uint*)_out;
14405 (out+j+0)[0] = (uint)in[i+0];
14406 (out+j+0)[1] = (uint)in[i+1];
14407 (out+j+0)[2] = (uint)in[i+3];
14408 (out+j+3)[0] = (uint)in[i+0];
14409 (out+j+3)[1] = (uint)in[i+3];
14410 (out+j+3)[2] = (uint)in[i+2];
14422 uint *out = (uint*)_out;
14426 (out+j)[0] = (uint)in[start];
14427 (out+j)[1] = (uint)in[i+1];
14428 (out+j)[2] = (uint)in[i+2];
14440 uint *out = (uint*)_out;
14444 (out+j)[0] = (uint)in[i+0];
14445 (out+j)[1] = (uint)in[i+1];
14446 (out+j)[2] = (uint)in[i+2];
14447 (out+j)[3] = (uint)in[i+3];
14459 uint *out = (uint*)_out;
14463 (out+j)[0] = (uint)in[i+0];
14464 (out+j)[1] = (uint)in[i+1];
14465 (out+j)[2] = (uint)in[i+2];
14466 (out+j)[3] = (uint)in[i+3];
14478 uint *out = (uint*)_out;
14482 (out+j)[0] = (uint)in[i+0];
14483 (out+j)[1] = (uint)in[i+1];
14484 (out+j)[2] = (uint)in[i+2];
14485 (out+j)[3] = (uint)in[i+3];
14486 (out+j)[4] = (uint)in[i+4];
14487 (out+j)[5] = (uint)in[i+5];
14499 uint *out = (uint*)_out;
14505 (out+j)[0] = (uint)in[i+0];
14506 (out+j)[1] = (uint)in[i+1];
14507 (out+j)[2] = (uint)in[i+2];
14508 (out+j)[3] = (uint)in[i+3];
14509 (out+j)[4] = (uint)in[i+4];
14510 (out+j)[5] = (uint)in[i+5];
14513 (out+j)[0] = (uint)in[i+2];
14514 (out+j)[1] = (uint)in[i-2];
14515 (out+j)[2] = (uint)in[i+0];
14516 (out+j)[3] = (uint)in[i+3];
14517 (out+j)[4] = (uint)in[i+4];
14518 (out+j)[5] = (uint)in[i+6];
14531 uint *out = (uint*)_out;
14535 (out+j)[0] = (uint)in[i];
14547 uint *out = (uint*)_out;
14551 (out+j)[0] = (uint)in[i];
14552 (out+j)[1] = (uint)in[i+1];
14564 uint *out = (uint*)_out;
14568 (out+j)[0] = (uint)in[i];
14569 (out+j)[1] = (uint)in[i+1];
14581 uint *out = (uint*)_out;
14585 (out+j)[0] = (uint)in[i];
14586 (out+j)[1] = (uint)in[i+1];
14588 (out+j)[0] = (uint)in[i];
14589 (out+j)[1] = (uint)in[start];
14600 uint *out = (uint*)_out;
14604 (out+j)[0] = (uint)in[i];
14605 (out+j)[1] = (uint)in[i+1];
14606 (out+j)[2] = (uint)in[i+2];
14618 uint *out = (uint*)_out;
14622 (out+j)[0] = (uint)in[i];
14623 (out+j)[1] = (uint)in[i+1+(i&1)];
14624 (out+j)[2] = (uint)in[i+2-(i&1)];
14636 uint *out = (uint*)_out;
14640 (out+j)[0] = (uint)in[start];
14641 (out+j)[1] = (uint)in[i+1];
14642 (out+j)[2] = (uint)in[i+2];
14654 uint *out = (uint*)_out;
14660 (out+j+0)[0] = restart_index;
14661 (out+j+0)[1] = restart_index;
14662 (out+j+0)[2] = restart_index;
14663 (out+j+3)[0] = restart_index;
14664 (out+j+3)[1] = restart_index;
14665 (out+j+3)[2] = restart_index;
14684 (out+j+0)[0] = (uint)in[i+0];
14685 (out+j+0)[1] = (uint)in[i+1];
14686 (out+j+0)[2] = (uint)in[i+2];
14687 (out+j+3)[0] = (uint)in[i+0];
14688 (out+j+3)[1] = (uint)in[i+2];
14689 (out+j+3)[2] = (uint)in[i+3];
14701 uint *out = (uint*)_out;
14707 (out+j+0)[0] = restart_index;
14708 (out+j+0)[1] = restart_index;
14709 (out+j+0)[2] = restart_index;
14710 (out+j+3)[0] = restart_index;
14711 (out+j+3)[1] = restart_index;
14712 (out+j+3)[2] = restart_index;
14731 (out+j+0)[0] = (uint)in[i+0];
14732 (out+j+0)[1] = (uint)in[i+1];
14733 (out+j+0)[2] = (uint)in[i+3];
14734 (out+j+3)[0] = (uint)in[i+0];
14735 (out+j+3)[1] = (uint)in[i+3];
14736 (out+j+3)[2] = (uint)in[i+2];
14748 uint *out = (uint*)_out;
14754 (out+j+0)[0] = restart_index;
14755 (out+j+0)[1] = restart_index;
14756 (out+j+0)[2] = restart_index;
14774 (out+j)[0] = (uint)in[start];
14775 (out+j)[1] = (uint)in[i+1];
14776 (out+j)[2] = (uint)in[i+2];
14788 uint *out = (uint*)_out;
14792 (out+j)[0] = (uint)in[i+0];
14793 (out+j)[1] = (uint)in[i+1];
14794 (out+j)[2] = (uint)in[i+2];
14795 (out+j)[3] = (uint)in[i+3];
14807 uint *out = (uint*)_out;
14811 (out+j)[0] = (uint)in[i+0];
14812 (out+j)[1] = (uint)in[i+1];
14813 (out+j)[2] = (uint)in[i+2];
14814 (out
14826 uint *out = (uint*)_out;
14830 (out+j)[0] = (uint)in[i+0];
14831 (out+j)[1] = (uint)in[i+1];
14832 (out+j)[2] = (uint)in[i+2];
14833 (out+j)[3] = (uint)in[i+3];
14834 (out+j)[4] = (uint)in[i+4];
14835 (out+j)[5] = (uint)in[i+5];
14847 uint *out = (uint*)_out;
14853 (out+j)[0] = (uint)in[i+0];
14854 (out+j)[1] = (uint)in[i+1];
14855 (out+j)[2] = (uint)in[i+2];
14856 (out+j)[3] = (uint)in[i+3];
14857 (out+j)[4] = (uint)in[i+4];
14858 (out+j)[5] = (uint)in[i+5];
14861 (out+j)[0] = (uint)in[i+2];
14862 (out+j)[1] = (uint)in[i-2];
14863 (out+j)[2] = (uint)in[i+0];
14864 (out+j)[3] = (uint)in[i+3];
14865 (out+j)[4] = (uint)in[i+4];
14866 (out+j)[5] = (uint)in[i+6];
14879 uint *out = (uint*)_out;
14883 (out+j)[0] = (uint)in[i];
14895 uint *out = (uint*)_out;
14899 (out+j)[0] = (uint)in[i+1];
14900 (out+j)[1] = (uint)in[i];
14912 uint *out = (uint*)_out;
14916 (out+j)[0] = (uint)in[i+1];
14917 (out+j)[1] = (uint)in[i];
14929 uint *out = (uint*)_out;
14933 (out+j)[0] = (uint)in[i+1];
14934 (out+j)[1] = (uint)in[i];
14936 (out+j)[0] = (uint)in[start];
14937 (out+j)[1] = (uint)in[i];
14948 uint *out = (uint*)_out;
14952 (out+j)[0] = (uint)in[i+1];
14953 (out+j)[1] = (uint)in[i+2];
14954 (out+j)[2] = (uint)in[i];
14966 uint *out = (uint*)_out;
14970 (out+j)[0] = (uint)in[i+1+(i&1)];
14971 (out+j)[1] = (uint)in[i+2-(i&1)];
14972 (out+j)[2] = (uint)in[i];
14984 uint *out = (uint*)_out;
14988 (out+j)[0] = (uint)in[i+1];
14989 (out+j)[1] = (uint)in[i+2];
14990 (out+j)[2] = (uint)in[start];
15002 uint *out = (uint*)_out;
15006 (out+j+0)[0] = (uint)in[i+1];
15007 (out+j+0)[1] = (uint)in[i+2];
15008 (out+j+0)[2] = (uint)in[i+0];
15009 (out+j+3)[0] = (uint)in[i+2];
15010 (out+j+3)[1] = (uint)in[i+3];
15011 (out+j+3)[2] = (uint)in[i+0];
15023 uint *out = (uint*)_out;
15027 (out+j+0)[0] = (uint)in[i+1];
15028 (out+j+0)[1] = (uint)in[i+3];
15029 (out+j+0)[2] = (uint)in[i+0];
15030 (out+j+3)[0] = (uint)in[i+3];
15031 (out+j+3)[1] = (uint)in[i+2];
15032 (out+j+3)[2] = (uint)in[i+0];
15044 uint *out = (uint*)_out;
15048 (out+j)[0] = (uint)in[i+1];
15049 (out+j)[1] = (uint)in[i+2];
15050 (out+j)[2] = (uint)in[start];
15062 uint *out = (uint*)_out;
15066 (out+j)[0] = (uint)in[i+3];
15067 (out+j)[1] = (uint)in[i+2];
15068 (out+j)[2] = (uint)in[i+1];
15069 (out+j)[3] = (uint)in[i+0];
15081 uint *out = (uint*)_out;
15085 (out+j)[0] = (uint)in[i+3];
15086 (out+j)[1] = (uint)in[i+2];
15087 (out+j)[2] = (uint)in[i+1];
15088 (out+j)[3] = (uint)in[i+0];
15100 uint *out = (uint*)_out;
15104 (out+j)[0] = (uint)in[i+4];
15105 (out+j)[1] = (uint)in[i+5];
15106 (out+j)[2] = (uint)in[i+0];
15107 (out+j)[3] = (uint)in[i+1];
15108 (out+j)[4] = (uint)in[i+2];
15109 (out+j)[5] = (uint)in[i+3];
15121 uint *out = (uint*)_out;
15127 (out+j)[0] = (uint)in[i+4];
15128 (out+j)[1] = (uint)in[i+5];
15129 (out+j)[2] = (uint)in[i+0];
15130 (out+j)[3] = (uint)in[i+1];
15131 (out+j)[4] = (uint)in[i+2];
15132 (out+j)[5] = (uint)in[i+3];
15135 (out+j)[0] = (uint)in[i+4];
15136 (out+j)[1] = (uint)in[i+6];
15137 (out+j)[2] = (uint)in[i+2];
15138 (out+j)[3] = (uint)in[i-2];
15139 (out+j)[4] = (uint)in[i+0];
15140 (out+j)[5] = (uint)in[i+3];
15153 uint *out = (uint*)_out;
15157 (out+j)[0] = (uint)in[i];
15169 uint *out = (uint*)_out;
15173 (out+j)[0] = (uint)in[i+1];
15174 (out+j)[1] = (uint)in[i];
15186 uint *out = (uint*)_out;
15190 (out+j)[0] = (uint)in[i+1];
15191 (out+j)[1] = (uint)in[i];
15203 uint *out = (uint*)_out;
15207 (out+j)[0] = (uint)in[i+1];
15208 (out+j)[1] = (uint)in[i];
15210 (out+j)[0] = (uint)in[start];
15211 (out+j)[1] = (uint)in[i];
15222 uint *out = (uint*)_out;
15226 (out+j)[0] = (uint)in[i+1];
15227 (out+j)[1] = (uint)in[i+2];
15228 (out+j)[2] = (uint)in[i];
15240 uint *out = (uint*)_out;
15244 (out+j)[0] = (uint)in[i+1+(i&1)];
15245 (out+j)[1] = (uint)in[i+2-(i&1)];
15246 (out+j)[2] = (uint)in[i];
15258 uint *out = (uint*)_out;
15262 (out+j)[0] = (uint)in[i+1];
15263 (out+j)[1] = (uint)in[i+2];
15264 (out+j)[2] = (uint)in[start];
15276 uint *out = (uint*)_out;
15282 (out+j+0)[0] = restart_index;
15283 (out+j+0)[1] = restart_index;
15284 (out+j+0)[2] = restart_index;
15285 (out+j+3)[0] = restart_index;
15286 (out+j+3)[1] = restart_index;
15287 (out+j+3)[2] = restart_index;
15306 (out+j+0)[0] = (uint)in[i+1];
15307 (out+j+0)[1] = (uint)in[i+2];
15308 (out+j+0)[2] = (uint)in[i+0];
15309 (out+j+3)[0] = (uint)in[i+2];
15310 (out+j+3)[1] = (uint)in[i+3];
15311 (out+j+3)[2] = (uint)in[i+0];
15323 uint *out = (uint*)_out;
15329 (out+j+0)[0] = restart_index;
15330 (out+j+0)[1] = restart_index;
15331 (out+j+0)[2] = restart_index;
15332 (out+j+3)[0] = restart_index;
15333 (out+j+3)[1] = restart_index;
15334 (out+j+3)[2] = restart_index;
15353 (out+j+0)[0] = (uint)in[i+1];
15354 (out+j+0)[1] = (uint)in[i+3];
15355 (out+j+0)[2] = (uint)in[i+0];
15356 (out+j+3)[0] = (uint)in[i+3];
15357 (out+j+3)[1] = (uint)in[i+2];
15358 (out+j+3)[2] = (uint)in[i+0];
15370 uint *out = (uint*)_out;
15376 (out+j+0)[0] = restart_index;
15377 (out+j+0)[1] = restart_index;
15378 (out+j+0)[2] = restart_index;
15396 (out+j)[0] = (uint)in[i+1];
15397 (out+j)[1] = (uint)in[i+2];
15398 (out+j)[2] = (uint)in[start];
15410 uint *out = (uint*)_out;
15414 (out+j)[0] = (uint)in[i+3];
15415 (out+j)[1] = (uint)in[i+2];
15416 (out+j)[2] = (uint)in[i+1];
15417 (out+j)[3] = (uint)in[i+0];
15429 uint *out = (uint*)_out;
15433 (out+j)[0] = (uint)in[i+3];
15434 (out+j)[1] = (uint)in[i+2];
15435 (out+j)[2] = (uint)in[i+1];
15436 (out+j)[3] = (uint)in[i+0];
15448 uint *out = (uint*)_out;
15452 (out+j)[0] = (uint)in[i+4];
15453 (out+j)[1] = (uint)in[i+5];
15454 (out+j)[2] = (uint)in[i+0];
15455 (out+j)[3] = (uint)in[i+1];
15456 (out+j)[4] = (uint)in[i+2];
15457 (out+j)[5] = (uint)in[i+3];
15469 uint *out = (uint*)_out;
15475 (out+j)[0] = (uint)in[i+4];
15476 (out+j)[1] = (uint)in[i+5];
15477 (out+j)[2] = (uint)in[i+0];
15478 (out+j)[3] = (uint)in[i+1];
15479 (out+j)[4] = (uint)in[i+2];
15480 (out+j)[5] = (uint)in[i+3];
15483 (out+j)[0] = (uint)in[i+4];
15484 (out+j)[1] = (uint)in[i+6];
15485 (out+j)[2] = (uint)in[i+2];
15486 (out+j)[3] = (uint)in[i-2];
15487 (out+j)[4] = (uint)in[i+0];
15488 (out+j)[5] = (uint)in[i+3];
15501 uint *out = (uint*)_out;
15505 (out+j)[0] = (uint)in[i];
15517 uint *out = (uint*)_out;
15521 (out+j)[0] = (uint)in[i+1];
15522 (out+j)[1] = (uint)in[i];
15534 uint *out = (uint*)_out;
15538 (out+j)[0] = (uint)in[i+1];
15539 (out+j)[1] = (uint)in[i];
15551 uint *out = (uint*)_out;
15555 (out+j)[0] = (uint)in[i+1];
15556 (out+j)[1] = (uint)in[i];
15558 (out+j)[0] = (uint)in[start];
15559 (out+j)[1] = (uint)in[i];
15570 uint *out = (uint*)_out;
15574 (out+j)[0] = (uint)in[i+2];
15575 (out+j)[1] = (uint)in[i];
15576 (out+j)[2] = (uint)in[i+1];
15588 uint *out = (uint*)_out;
15592 (out+j)[0] = (uint)in[i+2];
15593 (out+j)[1] = (uint)in[i+(i&1)];
15594 (out+j)[2] = (uint)in[i+1-(i&1)];
15606 uint *out = (uint*)_out;
15610 (out+j)[0] = (uint)in[i+2];
15611 (out+j)[1] = (uint)in[start];
15612 (out+j)[2] = (uint)in[i+1];
15624 uint *out = (uint*)_out;
15628 (out+j+0)[0] = (uint)in[i+3];
15629 (out+j+0)[1] = (uint)in[i+0];
15630 (out+j+0)[2] = (uint)in[i+1];
15631 (out+j+3)[0] = (uint)in[i+3];
15632 (out+j+3)[1] = (uint)in[i+1];
15633 (out+j+3)[2] = (uint)in[i+2];
15645 uint *out = (uint*)_out;
15649 (out+j+0)[0] = (uint)in[i+3];
15650 (out+j+0)[1] = (uint)in[i+2];
15651 (out+j+0)[2] = (uint)in[i+0];
15652 (out+j+3)[0] = (uint)in[i+3];
15653 (out+j+3)[1] = (uint)in[i+0];
15654 (out+j+3)[2] = (uint)in[i+1];
15666 uint *out = (uint*)_out;
15670 (out+j)[0] = (uint)in[start];
15671 (out+j)[1] = (uint)in[i+1];
15672 (out+j)[2] = (uint)in[i+2];
15684 uint *out = (uint*)_out;
15688 (out+j)[0] = (uint)in[i+3];
15689 (out+j)[1] = (uint)in[i+2];
15690 (out+j)[2] = (uint)in[i+1];
15691 (out+j)[3] = (uint)in[i+0];
15703 uint *out = (uint*)_out;
15707 (out+j)[0] = (uint)in[i+3];
15708 (out+j)[1] = (uint)in[i+2];
15709 (out+j)[2] = (uint)in[i+1];
15710 (out+j)[3] = (uint)in[i+0];
15722 uint *out = (uint*)_out;
15726 (out+j)[0] = (uint)in[i+4];
15727 (out+j)[1] = (uint)in[i+5];
15728 (out+j)[2] = (uint)in[i+0];
15729 (out+j)[3] = (uint)in[i+1];
15730 (out+j)[4] = (uint)in[i+2];
15731 (out+j)[5] = (uint)in[i+3];
15743 uint *out = (uint*)_out;
15749 (out+j)[0] = (uint)in[i+4];
15750 (out+j)[1] = (uint)in[i+5];
15751 (out+j)[2] = (uint)in[i+0];
15752 (out+j)[3] = (uint)in[i+1];
15753 (out+j)[4] = (uint)in[i+2];
15754 (out+j)[5] = (uint)in[i+3];
15757 (out+j)[0] = (uint)in[i+4];
15758 (out+j)[1] = (uint)in[i+6];
15759 (out+j)[2] = (uint)in[i+2];
15760 (out+j)[3] = (uint)in[i-2];
15761 (out+j)[4] = (uint)in[i+0];
15762 (out+j)[5] = (uint)in[i+3];
15775 uint *out = (uint*)_out;
15779 (out+j)[0] = (uint)in[i];
15791 uint *out = (uint*)_out;
15795 (out+j)[0] = (uint)in[i+1];
15796 (out+j)[1] = (uint)in[i];
15808 uint *out = (uint*)_out;
15812 (out+j)[0] = (uint)in[i+1];
15813 (out+j)[1] = (uint)in[i];
15825 uint *out = (uint*)_out;
15829 (out+j)[0] = (uint)in[i+1];
15830 (out+j)[1] = (uint)in[i];
15832 (out+j)[0] = (uint)in[start];
15833 (out+j)[1] = (uint)in[i];
15844 uint *out = (uint*)_out;
15848 (out+j)[0] = (uint)in[i+2];
15849 (out+j)[1] = (uint)in[i];
15850 (out+j)[2] = (uint)in[i+1];
15862 uint *out = (uint*)_out;
15866 (out+j)[0] = (uint)in[i+2];
15867 (out+j)[1] = (uint)in[i+(i&1)];
15868 (out+j)[2] = (uint)in[i+1-(i&1)];
15880 uint *out = (uint*)_out;
15884 (out+j)[0] = (uint)in[i+2];
15885 (out+j)[1] = (uint)in[start];
15886 (out+j)[2] = (uint)in[i+1];
15898 uint *out = (uint*)_out;
15904 (out+j+0)[0] = restart_index;
15905 (out+j+0)[1] = restart_index;
15906 (out+j+0)[2] = restart_index;
15907 (out+j+3)[0] = restart_index;
15908 (out+j+3)[1] = restart_index;
15909 (out+j+3)[2] = restart_index;
15928 (out+j+0)[0] = (uint)in[i+3];
15929 (out+j+0)[1] = (uint)in[i+0];
15930 (out+j+0)[2] = (uint)in[i+1];
15931 (out+j+3)[0] = (uint)in[i+3];
15932 (out+j+3)[1] = (uint)in[i+1];
15933 (out+j+3)[2] = (uint)in[i+2];
15945 uint *out = (uint*)_out;
15951 (out+j+0)[0] = restart_index;
15952 (out+j+0)[1] = restart_index;
15953 (out+j+0)[2] = restart_index;
15954 (out+j+3)[0] = restart_index;
15955 (out+j+3)[1] = restart_index;
15956 (out+j+3)[2] = restart_index;
15975 (out+j+0)[0] = (uint)in[i+3];
15976 (out+j+0)[1] = (uint)in[i+2];
15977 (out+j+0)[2] = (uint)in[i+0];
15978 (out+j+3)[0] = (uint)in[i+3];
15979 (out+j+3)[1] = (uint)in[i+0];
15980 (out+j+3)[2] = (uint)in[i+1];
15992 uint *out = (uint*)_out;
15998 (out+j+0)[0] = restart_index;
15999 (out+j+0)[1] = restart_index;
16000 (out+j+0)[2] = restart_index;
16018 (out+j)[0] = (uint)in[start];
16019 (out+j)[1] = (uint)in[i+1];
16020 (out+j)[2] = (uint)in[i+2];
16032 uint *out = (uint*)_out;
16036 (out+j)[0] = (uint)in[i+3];
16037 (out+j)[1] = (uint)in[i+2];
16038 (out+j)[2] = (uint)in[i+1];
16039 (out+j)[3] = (uint)in[i+0];
16051 uint *out = (uint*)_out;
16055 (out+j)[0] = (uint)in[i+3];
16056 (out+j)[1] = (uint)in[i+2];
16057 (out+j)[2] = (uint)in[i+1];
16058 (out+j)[3] = (uint)in[i+0];
16070 uint *out = (uint*)_out;
16074 (out+j)[0] = (uint)in[i+4];
16075 (out+j)[1] = (uint)in[i+5];
16076 (out+j)[2] = (uint)in[i+0];
16077 (out+j)[3] = (uint)in[i+1];
16078 (out+j)[4] = (uint)in[i+2];
16079 (out+j)[5] = (uint)in[i+3];
16091 uint *out = (uint*)_out;
16097 (out+j)[0] = (uint)in[i+4];
16098 (out+j)[1] = (uint)in[i+5];
16099 (out+j)[2] = (uint)in[i+0];
16100 (out+j)[3] = (uint)in[i+1];
16101 (out+j)[4] = (uint)in[i+2];
16102 (out+j)[5] = (uint)in[i+3];
16105 (out+j)[0] = (uint)in[i+4];
16106 (out+j)[1] = (uint)in[i+6];
16107 (out+j)[2] = (uint)in[i+2];
16108 (out+j)[3] = (uint)in[i-2];
16109 (out+j)[4] = (uint)in[i+0];
16110 (out+j)[5] = (uint)in[i+3];
16123 uint *out = (uint*)_out;
16127 (out+j)[0] = (uint)in[i];
16139 uint *out = (uint*)_out;
16143 (out+j)[0] = (uint)in[i];
16144 (out+j)[1] = (uint)in[i+1];
16156 uint *out = (uint*)_out;
16160 (out+j)[0] = (uint)in[i];
16161 (out+j)[1] = (uint)in[i+1];
16173 uint *out = (uint*)_out;
16177 (out+j)[0] = (uint)in[i];
16178 (out+j)[1] = (uint)in[i+1];
16180 (out+j)[0] = (uint)in[i];
16181 (out+j)[1] = (uint)in[start];
16192 uint *out = (uint*)_out;
16196 (out+j)[0] = (uint)in[i];
16197 (out+j)[1] = (uint)in[i+1];
16198 (out+j)[2] = (uint)in[i+2];
16210 uint *out = (uint*)_out;
16214 (out+j)[0] = (uint)in[i+(i&1)];
16215 (out+j)[1] = (uint)in[i+1-(i&1)];
16216 (out+j)[2] = (uint)in[i+2];
16228 uint *out = (uint*)_out;
16232 (out+j)[0] = (uint)in[start];
16233 (out+j)[1] = (uint)in[i+1];
16234 (out+j)[2] = (uint)in[i+2];
16246 uint *out = (uint*)_out;
16250 (out+j+0)[0] = (uint)in[i+0];
16251 (out+j+0)[1] = (uint)in[i+1];
16252 (out+j+0)[2] = (uint)in[i+3];
16253 (out+j+3)[0] = (uint)in[i+1];
16254 (out+j+3)[1] = (uint)in[i+2];
16255 (out+j+3)[2] = (uint)in[i+3];
16267 uint *out = (uint*)_out;
16271 (out+j+0)[0] = (uint)in[i+2];
16272 (out+j+0)[1] = (uint)in[i+0];
16273 (out+j+0)[2] = (uint)in[i+3];
16274 (out+j+3)[0] = (uint)in[i+0];
16275 (out+j+3)[1] = (uint)in[i+1];
16276 (out+j+3)[2] = (uint)in[i+3];
16288 uint *out = (uint*)_out;
16292 (out+j)[0] = (uint)in[i+1];
16293 (out+j)[1] = (uint)in[i+2];
16294 (out+j)[2] = (uint)in[start];
16306 uint *out = (uint*)_out;
16310 (out+j)[0] = (uint)in[i+0];
16311 (out+j)[1] = (uint)in[i+1];
16312 (out+j)[2] = (uint)in[i+2];
16313 (out+j)[3] = (uint)in[i+3];
16325 uint *out = (uint*)_out;
16329 (out+j)[0] = (uint)in[i+0];
16330 (out+j)[1] = (uint)in[i+1];
16331 (out+j)[2] = (uint)in[i+2];
16332 (out+j)[3] = (uint)in[i+3];
16344 uint *out = (uint*)_out;
16348 (out+j)[0] = (uint)in[i+0];
16349 (out+j)[1] = (uint)in[i+1];
16350 (out+j)[2] = (uint)in[i+2];
16351 (out+j)[3] = (uint)in[i+3];
16352 (out+j)[4] = (uint)in[i+4];
16353 (out+j)[5] = (uint)in[i+5];
16365 uint *out = (uint*)_out;
16371 (out+j)[0] = (uint)in[i+0];
16372 (out+j)[1] = (uint)in[i+1];
16373 (out+j)[2] = (uint)in[i+2];
16374 (out+j)[3] = (uint)in[i+3];
16375 (out+j)[4] = (uint)in[i+4];
16376 (out+j)[5] = (uint)in[i+5];
16379 (out+j)[0] = (uint)in[i+2];
16380 (out+j)[1] = (uint)in[i-2];
16381 (out+j)[2] = (uint)in[i+0];
16382 (out+j)[3] = (uint)in[i+3];
16383 (out+j)[4] = (uint)in[i+4];
16384 (out+j)[5] = (uint)in[i+6];
16397 uint *out = (uint*)_out;
16401 (out+j)[0] = (uint)in[i];
16413 uint *out = (uint*)_out;
16417 (out+j)[0] = (uint)in[i];
16418 (out+j)[1] = (uint)in[i+1];
16430 uint *out = (uint*)_out;
16434 (out+j)[0] = (uint)in[i];
16435 (out+j)[1] = (uint)in[i+1];
16447 uint *out = (uint*)_out;
16451 (out+j)[0] = (uint)in[i];
16452 (out+j)[1] = (uint)in[i+1];
16454 (out+j)[0] = (uint)in[i];
16455 (out+j)[1] = (uint)in[start];
16466 uint *out = (uint*)_out;
16470 (out+j)[0] = (uint)in[i];
16471 (out+j)[1] = (uint)in[i+1];
16472 (out+j)[2] = (uint)in[i+2];
16484 uint *out = (uint*)_out;
16488 (out+j)[0] = (uint)in[i+(i&1)];
16489 (out+j)[1] = (uint)in[i+1-(i&1)];
16490 (out+j)[2] = (uint)in[i+2];
16502 uint *out = (uint*)_out;
16506 (out+j)[0] = (uint)in[start];
16507 (out+j)[1] = (uint)in[i+1];
16508 (out+j)[2] = (uint)in[i+2];
16520 uint *out = (uint*)_out;
16526 (out+j+0)[0] = restart_index;
16527 (out+j+0)[1] = restart_index;
16528 (out+j+0)[2] = restart_index;
16529 (out+j+3)[0] = restart_index;
16530 (out+j+3)[1] = restart_index;
16531 (out+j+3)[2] = restart_index;
16550 (out+j+0)[0] = (uint)in[i+0];
16551 (out+j+0)[1] = (uint)in[i+1];
16552 (out+j+0)[2] = (uint)in[i+3];
16553 (out+j+3)[0] = (uint)in[i+1];
16554 (out+j+3)[1] = (uint)in[i+2];
16555 (out+j+3)[2] = (uint)in[i+3];
16567 uint *out = (uint*)_out;
16573 (out+j+0)[0] = restart_index;
16574 (out+j+0)[1] = restart_index;
16575 (out+j+0)[2] = restart_index;
16576 (out+j+3)[0] = restart_index;
16577 (out+j+3)[1] = restart_index;
16578 (out+j+3)[2] = restart_index;
16597 (out+j+0)[0] = (uint)in[i+2];
16598 (out+j+0)[1] = (uint)in[i+0];
16599 (out+j+0)[2] = (uint)in[i+3];
16600 (out+j+3)[0] = (uint)in[i+0];
16601 (out+j+3)[1] = (uint)in[i+1];
16602 (out+j+3)[2] = (uint)in[i+3];
16614 uint *out = (uint*)_out;
16620 (out+j+0)[0] = restart_index;
16621 (out+j+0)[1] = restart_index;
16622 (out+j+0)[2] = restart_index;
16640 (out+j)[0] = (uint)in[i+1];
16641 (out+j)[1] = (uint)in[i+2];
16642 (out+j)[2] = (uint)in[start];
16654 uint *out = (uint*)_out;
16658 (out+j)[0] = (uint)in[i+0];
16659 (out+j)[1] = (uint)in[i+1];
16660 (out+j)[2] = (uint)in[i+2];
16661 (out+j)[3] = (uint)in[i+3];
16673 uint *out = (uint*)_out;
16677 (out+j)[0] = (uint)in[i+0];
16678 (out+j)[1] = (uint)in[i+1];
16679 (out+j)[2] = (uint)in[i+2];
16680 (out+j)[3] = (uint)in[i+3];
16692 uint *out = (uint*)_out;
16696 (out+j)[0] = (uint)in[i+0];
16697 (out+j)[1] = (uint)in[i+1];
16698 (out+j)[2] = (uint)in[i+2];
16699 (out+j)[3] = (uint)in[i+3];
16700 (out+j)[4] = (uint)in[i+4];
16701 (out+j)[5] = (uint)in[i+5];
16713 uint *out = (uint*)_out;
16719 (out+j)[0] = (uint)in[i+0];
16720 (out+j)[1] = (uint)in[i+1];
16721 (out+j)[2] = (uint)in[i+2];
16722 (out+j)[3] = (uint)in[i+3];
16723 (out+j)[4] = (uint)in[i+4];
16724 (out+j)[5] = (uint)in[i+5];
16727 (out+j)[0] = (uint)in[i+2];
16728 (out+j)[1] = (uint)in[i-2];
16729 (out+j)[2] = (uint)in[i+0];
16730 (out+j)[3] = (uint)in[i+3];
16731 (out+j)[4] = (uint)in[i+4];
16732 (out+j)[5] = (uint)in[i+6];