1 1.1 joerg /*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------=== 2 1.1 joerg * 3 1.1 joerg * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 1.1 joerg * See https://llvm.org/LICENSE.txt for license information. 5 1.1 joerg * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 1.1 joerg * 7 1.1 joerg *===-----------------------------------------------------------------------=== 8 1.1 joerg */ 9 1.1 joerg 10 1.1 joerg #ifndef __IMMINTRIN_H 11 1.1 joerg #error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead." 12 1.1 joerg #endif 13 1.1 joerg 14 1.1 joerg #ifndef __AVX512DQINTRIN_H 15 1.1 joerg #define __AVX512DQINTRIN_H 16 1.1 joerg 17 1.1 joerg /* Define the default attributes for the functions in this file. */ 18 1.1 joerg #define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512))) 19 1.1 joerg #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"))) 20 1.1 joerg 21 1.1 joerg static __inline __mmask8 __DEFAULT_FN_ATTRS 22 1.1 joerg _knot_mask8(__mmask8 __M) 23 1.1 joerg { 24 1.1 joerg return __builtin_ia32_knotqi(__M); 25 1.1 joerg } 26 1.1 joerg 27 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 28 1.1 joerg _kand_mask8(__mmask8 __A, __mmask8 __B) 29 1.1 joerg { 30 1.1 joerg return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B); 31 1.1 joerg } 32 1.1 joerg 33 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 34 1.1 joerg _kandn_mask8(__mmask8 __A, __mmask8 __B) 35 1.1 joerg { 36 1.1 joerg return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B); 37 1.1 joerg } 38 1.1 joerg 39 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 40 1.1 joerg _kor_mask8(__mmask8 __A, __mmask8 __B) 41 1.1 joerg { 42 1.1 joerg return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B); 43 1.1 joerg } 44 1.1 joerg 45 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 46 1.1 joerg _kxnor_mask8(__mmask8 __A, __mmask8 __B) 47 1.1 joerg { 48 1.1 joerg return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B); 49 1.1 joerg } 50 1.1 joerg 51 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 52 1.1 joerg _kxor_mask8(__mmask8 __A, __mmask8 __B) 53 1.1 joerg { 54 1.1 joerg return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); 55 1.1 joerg } 56 1.1 joerg 57 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 58 1.1 joerg _kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) 59 1.1 joerg { 60 1.1 joerg return (unsigned char)__builtin_ia32_kortestcqi(__A, __B); 61 1.1 joerg } 62 1.1 joerg 63 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 64 1.1 joerg _kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) 65 1.1 joerg { 66 1.1 joerg return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); 67 1.1 joerg } 68 1.1 joerg 69 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 70 1.1 joerg _kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { 71 1.1 joerg *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B); 72 1.1 joerg return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); 73 1.1 joerg } 74 1.1 joerg 75 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 76 1.1 joerg _ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) 77 1.1 joerg { 78 1.1 joerg return (unsigned char)__builtin_ia32_ktestcqi(__A, __B); 79 1.1 joerg } 80 1.1 joerg 81 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 82 1.1 joerg _ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) 83 1.1 joerg { 84 1.1 joerg return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); 85 1.1 joerg } 86 1.1 joerg 87 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 88 1.1 joerg _ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { 89 1.1 joerg *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B); 90 1.1 joerg return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); 91 1.1 joerg } 92 1.1 joerg 93 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 94 1.1 joerg _ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) 95 1.1 joerg { 96 1.1 joerg return (unsigned char)__builtin_ia32_ktestchi(__A, __B); 97 1.1 joerg } 98 1.1 joerg 99 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 100 1.1 joerg _ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) 101 1.1 joerg { 102 1.1 joerg return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); 103 1.1 joerg } 104 1.1 joerg 105 1.1 joerg static __inline__ unsigned char __DEFAULT_FN_ATTRS 106 1.1 joerg _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { 107 1.1 joerg *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B); 108 1.1 joerg return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); 109 1.1 joerg } 110 1.1 joerg 111 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 112 1.1 joerg _kadd_mask8(__mmask8 __A, __mmask8 __B) 113 1.1 joerg { 114 1.1 joerg return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B); 115 1.1 joerg } 116 1.1 joerg 117 1.1 joerg static __inline__ __mmask16 __DEFAULT_FN_ATTRS 118 1.1 joerg _kadd_mask16(__mmask16 __A, __mmask16 __B) 119 1.1 joerg { 120 1.1 joerg return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B); 121 1.1 joerg } 122 1.1 joerg 123 1.1 joerg #define _kshiftli_mask8(A, I) \ 124 1.1 joerg (__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)) 125 1.1 joerg 126 1.1 joerg #define _kshiftri_mask8(A, I) \ 127 1.1 joerg (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)) 128 1.1 joerg 129 1.1 joerg static __inline__ unsigned int __DEFAULT_FN_ATTRS 130 1.1 joerg _cvtmask8_u32(__mmask8 __A) { 131 1.1 joerg return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A); 132 1.1 joerg } 133 1.1 joerg 134 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 135 1.1 joerg _cvtu32_mask8(unsigned int __A) { 136 1.1 joerg return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A); 137 1.1 joerg } 138 1.1 joerg 139 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS 140 1.1 joerg _load_mask8(__mmask8 *__A) { 141 1.1 joerg return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A); 142 1.1 joerg } 143 1.1 joerg 144 1.1 joerg static __inline__ void __DEFAULT_FN_ATTRS 145 1.1 joerg _store_mask8(__mmask8 *__A, __mmask8 __B) { 146 1.1 joerg *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B); 147 1.1 joerg } 148 1.1 joerg 149 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 150 1.1 joerg _mm512_mullo_epi64 (__m512i __A, __m512i __B) { 151 1.1 joerg return (__m512i) ((__v8du) __A * (__v8du) __B); 152 1.1 joerg } 153 1.1 joerg 154 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 155 1.1 joerg _mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { 156 1.1 joerg return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 157 1.1 joerg (__v8di)_mm512_mullo_epi64(__A, __B), 158 1.1 joerg (__v8di)__W); 159 1.1 joerg } 160 1.1 joerg 161 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 162 1.1 joerg _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { 163 1.1 joerg return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, 164 1.1 joerg (__v8di)_mm512_mullo_epi64(__A, __B), 165 1.1 joerg (__v8di)_mm512_setzero_si512()); 166 1.1 joerg } 167 1.1 joerg 168 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 169 1.1 joerg _mm512_xor_pd(__m512d __A, __m512d __B) { 170 1.1 joerg return (__m512d)((__v8du)__A ^ (__v8du)__B); 171 1.1 joerg } 172 1.1 joerg 173 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 174 1.1 joerg _mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 175 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 176 1.1 joerg (__v8df)_mm512_xor_pd(__A, __B), 177 1.1 joerg (__v8df)__W); 178 1.1 joerg } 179 1.1 joerg 180 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 181 1.1 joerg _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { 182 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 183 1.1 joerg (__v8df)_mm512_xor_pd(__A, __B), 184 1.1 joerg (__v8df)_mm512_setzero_pd()); 185 1.1 joerg } 186 1.1 joerg 187 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 188 1.1 joerg _mm512_xor_ps (__m512 __A, __m512 __B) { 189 1.1 joerg return (__m512)((__v16su)__A ^ (__v16su)__B); 190 1.1 joerg } 191 1.1 joerg 192 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 193 1.1 joerg _mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 194 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 195 1.1 joerg (__v16sf)_mm512_xor_ps(__A, __B), 196 1.1 joerg (__v16sf)__W); 197 1.1 joerg } 198 1.1 joerg 199 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 200 1.1 joerg _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { 201 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 202 1.1 joerg (__v16sf)_mm512_xor_ps(__A, __B), 203 1.1 joerg (__v16sf)_mm512_setzero_ps()); 204 1.1 joerg } 205 1.1 joerg 206 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 207 1.1 joerg _mm512_or_pd(__m512d __A, __m512d __B) { 208 1.1 joerg return (__m512d)((__v8du)__A | (__v8du)__B); 209 1.1 joerg } 210 1.1 joerg 211 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 212 1.1 joerg _mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 213 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 214 1.1 joerg (__v8df)_mm512_or_pd(__A, __B), 215 1.1 joerg (__v8df)__W); 216 1.1 joerg } 217 1.1 joerg 218 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 219 1.1 joerg _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { 220 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 221 1.1 joerg (__v8df)_mm512_or_pd(__A, __B), 222 1.1 joerg (__v8df)_mm512_setzero_pd()); 223 1.1 joerg } 224 1.1 joerg 225 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 226 1.1 joerg _mm512_or_ps(__m512 __A, __m512 __B) { 227 1.1 joerg return (__m512)((__v16su)__A | (__v16su)__B); 228 1.1 joerg } 229 1.1 joerg 230 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 231 1.1 joerg _mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 232 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 233 1.1 joerg (__v16sf)_mm512_or_ps(__A, __B), 234 1.1 joerg (__v16sf)__W); 235 1.1 joerg } 236 1.1 joerg 237 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 238 1.1 joerg _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { 239 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 240 1.1 joerg (__v16sf)_mm512_or_ps(__A, __B), 241 1.1 joerg (__v16sf)_mm512_setzero_ps()); 242 1.1 joerg } 243 1.1 joerg 244 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 245 1.1 joerg _mm512_and_pd(__m512d __A, __m512d __B) { 246 1.1 joerg return (__m512d)((__v8du)__A & (__v8du)__B); 247 1.1 joerg } 248 1.1 joerg 249 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 250 1.1 joerg _mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 251 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 252 1.1 joerg (__v8df)_mm512_and_pd(__A, __B), 253 1.1 joerg (__v8df)__W); 254 1.1 joerg } 255 1.1 joerg 256 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 257 1.1 joerg _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { 258 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 259 1.1 joerg (__v8df)_mm512_and_pd(__A, __B), 260 1.1 joerg (__v8df)_mm512_setzero_pd()); 261 1.1 joerg } 262 1.1 joerg 263 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 264 1.1 joerg _mm512_and_ps(__m512 __A, __m512 __B) { 265 1.1 joerg return (__m512)((__v16su)__A & (__v16su)__B); 266 1.1 joerg } 267 1.1 joerg 268 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 269 1.1 joerg _mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 270 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 271 1.1 joerg (__v16sf)_mm512_and_ps(__A, __B), 272 1.1 joerg (__v16sf)__W); 273 1.1 joerg } 274 1.1 joerg 275 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 276 1.1 joerg _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { 277 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 278 1.1 joerg (__v16sf)_mm512_and_ps(__A, __B), 279 1.1 joerg (__v16sf)_mm512_setzero_ps()); 280 1.1 joerg } 281 1.1 joerg 282 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 283 1.1 joerg _mm512_andnot_pd(__m512d __A, __m512d __B) { 284 1.1 joerg return (__m512d)(~(__v8du)__A & (__v8du)__B); 285 1.1 joerg } 286 1.1 joerg 287 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 288 1.1 joerg _mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { 289 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 290 1.1 joerg (__v8df)_mm512_andnot_pd(__A, __B), 291 1.1 joerg (__v8df)__W); 292 1.1 joerg } 293 1.1 joerg 294 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 295 1.1 joerg _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { 296 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 297 1.1 joerg (__v8df)_mm512_andnot_pd(__A, __B), 298 1.1 joerg (__v8df)_mm512_setzero_pd()); 299 1.1 joerg } 300 1.1 joerg 301 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 302 1.1 joerg _mm512_andnot_ps(__m512 __A, __m512 __B) { 303 1.1 joerg return (__m512)(~(__v16su)__A & (__v16su)__B); 304 1.1 joerg } 305 1.1 joerg 306 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 307 1.1 joerg _mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { 308 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 309 1.1 joerg (__v16sf)_mm512_andnot_ps(__A, __B), 310 1.1 joerg (__v16sf)__W); 311 1.1 joerg } 312 1.1 joerg 313 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 314 1.1 joerg _mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { 315 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, 316 1.1 joerg (__v16sf)_mm512_andnot_ps(__A, __B), 317 1.1 joerg (__v16sf)_mm512_setzero_ps()); 318 1.1 joerg } 319 1.1 joerg 320 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 321 1.1 joerg _mm512_cvtpd_epi64 (__m512d __A) { 322 1.1 joerg return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, 323 1.1 joerg (__v8di) _mm512_setzero_si512(), 324 1.1 joerg (__mmask8) -1, 325 1.1 joerg _MM_FROUND_CUR_DIRECTION); 326 1.1 joerg } 327 1.1 joerg 328 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 329 1.1 joerg _mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { 330 1.1 joerg return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, 331 1.1 joerg (__v8di) __W, 332 1.1 joerg (__mmask8) __U, 333 1.1 joerg _MM_FROUND_CUR_DIRECTION); 334 1.1 joerg } 335 1.1 joerg 336 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 337 1.1 joerg _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { 338 1.1 joerg return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, 339 1.1 joerg (__v8di) _mm512_setzero_si512(), 340 1.1 joerg (__mmask8) __U, 341 1.1 joerg _MM_FROUND_CUR_DIRECTION); 342 1.1 joerg } 343 1.1 joerg 344 1.1 joerg #define _mm512_cvt_roundpd_epi64(A, R) \ 345 1.1 joerg (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ 346 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 347 1.1 joerg (__mmask8)-1, (int)(R)) 348 1.1 joerg 349 1.1 joerg #define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \ 350 1.1 joerg (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ 351 1.1 joerg (__v8di)(__m512i)(W), \ 352 1.1 joerg (__mmask8)(U), (int)(R)) 353 1.1 joerg 354 1.1 joerg #define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \ 355 1.1 joerg (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ 356 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 357 1.1 joerg (__mmask8)(U), (int)(R)) 358 1.1 joerg 359 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 360 1.1 joerg _mm512_cvtpd_epu64 (__m512d __A) { 361 1.1 joerg return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, 362 1.1 joerg (__v8di) _mm512_setzero_si512(), 363 1.1 joerg (__mmask8) -1, 364 1.1 joerg _MM_FROUND_CUR_DIRECTION); 365 1.1 joerg } 366 1.1 joerg 367 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 368 1.1 joerg _mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { 369 1.1 joerg return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, 370 1.1 joerg (__v8di) __W, 371 1.1 joerg (__mmask8) __U, 372 1.1 joerg _MM_FROUND_CUR_DIRECTION); 373 1.1 joerg } 374 1.1 joerg 375 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 376 1.1 joerg _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { 377 1.1 joerg return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, 378 1.1 joerg (__v8di) _mm512_setzero_si512(), 379 1.1 joerg (__mmask8) __U, 380 1.1 joerg _MM_FROUND_CUR_DIRECTION); 381 1.1 joerg } 382 1.1 joerg 383 1.1 joerg #define _mm512_cvt_roundpd_epu64(A, R) \ 384 1.1 joerg (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ 385 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 386 1.1 joerg (__mmask8)-1, (int)(R)) 387 1.1 joerg 388 1.1 joerg #define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \ 389 1.1 joerg (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ 390 1.1 joerg (__v8di)(__m512i)(W), \ 391 1.1 joerg (__mmask8)(U), (int)(R)) 392 1.1 joerg 393 1.1 joerg #define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \ 394 1.1 joerg (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ 395 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 396 1.1 joerg (__mmask8)(U), (int)(R)) 397 1.1 joerg 398 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 399 1.1 joerg _mm512_cvtps_epi64 (__m256 __A) { 400 1.1 joerg return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, 401 1.1 joerg (__v8di) _mm512_setzero_si512(), 402 1.1 joerg (__mmask8) -1, 403 1.1 joerg _MM_FROUND_CUR_DIRECTION); 404 1.1 joerg } 405 1.1 joerg 406 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 407 1.1 joerg _mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { 408 1.1 joerg return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, 409 1.1 joerg (__v8di) __W, 410 1.1 joerg (__mmask8) __U, 411 1.1 joerg _MM_FROUND_CUR_DIRECTION); 412 1.1 joerg } 413 1.1 joerg 414 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 415 1.1 joerg _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { 416 1.1 joerg return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, 417 1.1 joerg (__v8di) _mm512_setzero_si512(), 418 1.1 joerg (__mmask8) __U, 419 1.1 joerg _MM_FROUND_CUR_DIRECTION); 420 1.1 joerg } 421 1.1 joerg 422 1.1 joerg #define _mm512_cvt_roundps_epi64(A, R) \ 423 1.1 joerg (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ 424 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 425 1.1 joerg (__mmask8)-1, (int)(R)) 426 1.1 joerg 427 1.1 joerg #define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \ 428 1.1 joerg (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ 429 1.1 joerg (__v8di)(__m512i)(W), \ 430 1.1 joerg (__mmask8)(U), (int)(R)) 431 1.1 joerg 432 1.1 joerg #define _mm512_maskz_cvt_roundps_epi64(U, A, R) \ 433 1.1 joerg (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ 434 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 435 1.1 joerg (__mmask8)(U), (int)(R)) 436 1.1 joerg 437 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 438 1.1 joerg _mm512_cvtps_epu64 (__m256 __A) { 439 1.1 joerg return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, 440 1.1 joerg (__v8di) _mm512_setzero_si512(), 441 1.1 joerg (__mmask8) -1, 442 1.1 joerg _MM_FROUND_CUR_DIRECTION); 443 1.1 joerg } 444 1.1 joerg 445 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 446 1.1 joerg _mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { 447 1.1 joerg return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, 448 1.1 joerg (__v8di) __W, 449 1.1 joerg (__mmask8) __U, 450 1.1 joerg _MM_FROUND_CUR_DIRECTION); 451 1.1 joerg } 452 1.1 joerg 453 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 454 1.1 joerg _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { 455 1.1 joerg return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, 456 1.1 joerg (__v8di) _mm512_setzero_si512(), 457 1.1 joerg (__mmask8) __U, 458 1.1 joerg _MM_FROUND_CUR_DIRECTION); 459 1.1 joerg } 460 1.1 joerg 461 1.1 joerg #define _mm512_cvt_roundps_epu64(A, R) \ 462 1.1 joerg (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ 463 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 464 1.1 joerg (__mmask8)-1, (int)(R)) 465 1.1 joerg 466 1.1 joerg #define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \ 467 1.1 joerg (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ 468 1.1 joerg (__v8di)(__m512i)(W), \ 469 1.1 joerg (__mmask8)(U), (int)(R)) 470 1.1 joerg 471 1.1 joerg #define _mm512_maskz_cvt_roundps_epu64(U, A, R) \ 472 1.1 joerg (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ 473 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 474 1.1 joerg (__mmask8)(U), (int)(R)) 475 1.1 joerg 476 1.1 joerg 477 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 478 1.1 joerg _mm512_cvtepi64_pd (__m512i __A) { 479 1.1 joerg return (__m512d)__builtin_convertvector((__v8di)__A, __v8df); 480 1.1 joerg } 481 1.1 joerg 482 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 483 1.1 joerg _mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) { 484 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 485 1.1 joerg (__v8df)_mm512_cvtepi64_pd(__A), 486 1.1 joerg (__v8df)__W); 487 1.1 joerg } 488 1.1 joerg 489 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 490 1.1 joerg _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { 491 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 492 1.1 joerg (__v8df)_mm512_cvtepi64_pd(__A), 493 1.1 joerg (__v8df)_mm512_setzero_pd()); 494 1.1 joerg } 495 1.1 joerg 496 1.1 joerg #define _mm512_cvt_roundepi64_pd(A, R) \ 497 1.1 joerg (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ 498 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 499 1.1 joerg (__mmask8)-1, (int)(R)) 500 1.1 joerg 501 1.1 joerg #define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \ 502 1.1 joerg (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ 503 1.1 joerg (__v8df)(__m512d)(W), \ 504 1.1 joerg (__mmask8)(U), (int)(R)) 505 1.1 joerg 506 1.1 joerg #define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \ 507 1.1 joerg (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ 508 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 509 1.1 joerg (__mmask8)(U), (int)(R)) 510 1.1 joerg 511 1.1 joerg static __inline__ __m256 __DEFAULT_FN_ATTRS512 512 1.1 joerg _mm512_cvtepi64_ps (__m512i __A) { 513 1.1 joerg return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, 514 1.1 joerg (__v8sf) _mm256_setzero_ps(), 515 1.1 joerg (__mmask8) -1, 516 1.1 joerg _MM_FROUND_CUR_DIRECTION); 517 1.1 joerg } 518 1.1 joerg 519 1.1 joerg static __inline__ __m256 __DEFAULT_FN_ATTRS512 520 1.1 joerg _mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) { 521 1.1 joerg return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, 522 1.1 joerg (__v8sf) __W, 523 1.1 joerg (__mmask8) __U, 524 1.1 joerg _MM_FROUND_CUR_DIRECTION); 525 1.1 joerg } 526 1.1 joerg 527 1.1 joerg static __inline__ __m256 __DEFAULT_FN_ATTRS512 528 1.1 joerg _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { 529 1.1 joerg return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, 530 1.1 joerg (__v8sf) _mm256_setzero_ps(), 531 1.1 joerg (__mmask8) __U, 532 1.1 joerg _MM_FROUND_CUR_DIRECTION); 533 1.1 joerg } 534 1.1 joerg 535 1.1 joerg #define _mm512_cvt_roundepi64_ps(A, R) \ 536 1.1 joerg (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ 537 1.1 joerg (__v8sf)_mm256_setzero_ps(), \ 538 1.1 joerg (__mmask8)-1, (int)(R)) 539 1.1 joerg 540 1.1 joerg #define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \ 541 1.1 joerg (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ 542 1.1 joerg (__v8sf)(__m256)(W), (__mmask8)(U), \ 543 1.1 joerg (int)(R)) 544 1.1 joerg 545 1.1 joerg #define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \ 546 1.1 joerg (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ 547 1.1 joerg (__v8sf)_mm256_setzero_ps(), \ 548 1.1 joerg (__mmask8)(U), (int)(R)) 549 1.1 joerg 550 1.1 joerg 551 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 552 1.1 joerg _mm512_cvttpd_epi64 (__m512d __A) { 553 1.1 joerg return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, 554 1.1 joerg (__v8di) _mm512_setzero_si512(), 555 1.1 joerg (__mmask8) -1, 556 1.1 joerg _MM_FROUND_CUR_DIRECTION); 557 1.1 joerg } 558 1.1 joerg 559 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 560 1.1 joerg _mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { 561 1.1 joerg return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, 562 1.1 joerg (__v8di) __W, 563 1.1 joerg (__mmask8) __U, 564 1.1 joerg _MM_FROUND_CUR_DIRECTION); 565 1.1 joerg } 566 1.1 joerg 567 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 568 1.1 joerg _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { 569 1.1 joerg return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, 570 1.1 joerg (__v8di) _mm512_setzero_si512(), 571 1.1 joerg (__mmask8) __U, 572 1.1 joerg _MM_FROUND_CUR_DIRECTION); 573 1.1 joerg } 574 1.1 joerg 575 1.1 joerg #define _mm512_cvtt_roundpd_epi64(A, R) \ 576 1.1 joerg (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ 577 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 578 1.1 joerg (__mmask8)-1, (int)(R)) 579 1.1 joerg 580 1.1 joerg #define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \ 581 1.1 joerg (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ 582 1.1 joerg (__v8di)(__m512i)(W), \ 583 1.1 joerg (__mmask8)(U), (int)(R)) 584 1.1 joerg 585 1.1 joerg #define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \ 586 1.1 joerg (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ 587 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 588 1.1 joerg (__mmask8)(U), (int)(R)) 589 1.1 joerg 590 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 591 1.1 joerg _mm512_cvttpd_epu64 (__m512d __A) { 592 1.1 joerg return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, 593 1.1 joerg (__v8di) _mm512_setzero_si512(), 594 1.1 joerg (__mmask8) -1, 595 1.1 joerg _MM_FROUND_CUR_DIRECTION); 596 1.1 joerg } 597 1.1 joerg 598 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 599 1.1 joerg _mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { 600 1.1 joerg return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, 601 1.1 joerg (__v8di) __W, 602 1.1 joerg (__mmask8) __U, 603 1.1 joerg _MM_FROUND_CUR_DIRECTION); 604 1.1 joerg } 605 1.1 joerg 606 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 607 1.1 joerg _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { 608 1.1 joerg return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, 609 1.1 joerg (__v8di) _mm512_setzero_si512(), 610 1.1 joerg (__mmask8) __U, 611 1.1 joerg _MM_FROUND_CUR_DIRECTION); 612 1.1 joerg } 613 1.1 joerg 614 1.1 joerg #define _mm512_cvtt_roundpd_epu64(A, R) \ 615 1.1 joerg (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ 616 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 617 1.1 joerg (__mmask8)-1, (int)(R)) 618 1.1 joerg 619 1.1 joerg #define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \ 620 1.1 joerg (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ 621 1.1 joerg (__v8di)(__m512i)(W), \ 622 1.1 joerg (__mmask8)(U), (int)(R)) 623 1.1 joerg 624 1.1 joerg #define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \ 625 1.1 joerg (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ 626 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 627 1.1 joerg (__mmask8)(U), (int)(R)) 628 1.1 joerg 629 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 630 1.1 joerg _mm512_cvttps_epi64 (__m256 __A) { 631 1.1 joerg return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, 632 1.1 joerg (__v8di) _mm512_setzero_si512(), 633 1.1 joerg (__mmask8) -1, 634 1.1 joerg _MM_FROUND_CUR_DIRECTION); 635 1.1 joerg } 636 1.1 joerg 637 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 638 1.1 joerg _mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { 639 1.1 joerg return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, 640 1.1 joerg (__v8di) __W, 641 1.1 joerg (__mmask8) __U, 642 1.1 joerg _MM_FROUND_CUR_DIRECTION); 643 1.1 joerg } 644 1.1 joerg 645 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 646 1.1 joerg _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { 647 1.1 joerg return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, 648 1.1 joerg (__v8di) _mm512_setzero_si512(), 649 1.1 joerg (__mmask8) __U, 650 1.1 joerg _MM_FROUND_CUR_DIRECTION); 651 1.1 joerg } 652 1.1 joerg 653 1.1 joerg #define _mm512_cvtt_roundps_epi64(A, R) \ 654 1.1 joerg (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ 655 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 656 1.1 joerg (__mmask8)-1, (int)(R)) 657 1.1 joerg 658 1.1 joerg #define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \ 659 1.1 joerg (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ 660 1.1 joerg (__v8di)(__m512i)(W), \ 661 1.1 joerg (__mmask8)(U), (int)(R)) 662 1.1 joerg 663 1.1 joerg #define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \ 664 1.1 joerg (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ 665 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 666 1.1 joerg (__mmask8)(U), (int)(R)) 667 1.1 joerg 668 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 669 1.1 joerg _mm512_cvttps_epu64 (__m256 __A) { 670 1.1 joerg return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, 671 1.1 joerg (__v8di) _mm512_setzero_si512(), 672 1.1 joerg (__mmask8) -1, 673 1.1 joerg _MM_FROUND_CUR_DIRECTION); 674 1.1 joerg } 675 1.1 joerg 676 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 677 1.1 joerg _mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { 678 1.1 joerg return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, 679 1.1 joerg (__v8di) __W, 680 1.1 joerg (__mmask8) __U, 681 1.1 joerg _MM_FROUND_CUR_DIRECTION); 682 1.1 joerg } 683 1.1 joerg 684 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 685 1.1 joerg _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { 686 1.1 joerg return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, 687 1.1 joerg (__v8di) _mm512_setzero_si512(), 688 1.1 joerg (__mmask8) __U, 689 1.1 joerg _MM_FROUND_CUR_DIRECTION); 690 1.1 joerg } 691 1.1 joerg 692 1.1 joerg #define _mm512_cvtt_roundps_epu64(A, R) \ 693 1.1 joerg (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ 694 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 695 1.1 joerg (__mmask8)-1, (int)(R)) 696 1.1 joerg 697 1.1 joerg #define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \ 698 1.1 joerg (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ 699 1.1 joerg (__v8di)(__m512i)(W), \ 700 1.1 joerg (__mmask8)(U), (int)(R)) 701 1.1 joerg 702 1.1 joerg #define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \ 703 1.1 joerg (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ 704 1.1 joerg (__v8di)_mm512_setzero_si512(), \ 705 1.1 joerg (__mmask8)(U), (int)(R)) 706 1.1 joerg 707 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 708 1.1 joerg _mm512_cvtepu64_pd (__m512i __A) { 709 1.1 joerg return (__m512d)__builtin_convertvector((__v8du)__A, __v8df); 710 1.1 joerg } 711 1.1 joerg 712 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 713 1.1 joerg _mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) { 714 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 715 1.1 joerg (__v8df)_mm512_cvtepu64_pd(__A), 716 1.1 joerg (__v8df)__W); 717 1.1 joerg } 718 1.1 joerg 719 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 720 1.1 joerg _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { 721 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, 722 1.1 joerg (__v8df)_mm512_cvtepu64_pd(__A), 723 1.1 joerg (__v8df)_mm512_setzero_pd()); 724 1.1 joerg } 725 1.1 joerg 726 1.1 joerg #define _mm512_cvt_roundepu64_pd(A, R) \ 727 1.1 joerg (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ 728 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 729 1.1 joerg (__mmask8)-1, (int)(R)) 730 1.1 joerg 731 1.1 joerg #define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \ 732 1.1 joerg (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ 733 1.1 joerg (__v8df)(__m512d)(W), \ 734 1.1 joerg (__mmask8)(U), (int)(R)) 735 1.1 joerg 736 1.1 joerg 737 1.1 joerg #define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \ 738 1.1 joerg (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ 739 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 740 1.1 joerg (__mmask8)(U), (int)(R)) 741 1.1 joerg 742 1.1 joerg 743 1.1 joerg static __inline__ __m256 __DEFAULT_FN_ATTRS512 744 1.1 joerg _mm512_cvtepu64_ps (__m512i __A) { 745 1.1 joerg return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, 746 1.1 joerg (__v8sf) _mm256_setzero_ps(), 747 1.1 joerg (__mmask8) -1, 748 1.1 joerg _MM_FROUND_CUR_DIRECTION); 749 1.1 joerg } 750 1.1 joerg 751 1.1 joerg static __inline__ __m256 __DEFAULT_FN_ATTRS512 752 1.1 joerg _mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) { 753 1.1 joerg return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, 754 1.1 joerg (__v8sf) __W, 755 1.1 joerg (__mmask8) __U, 756 1.1 joerg _MM_FROUND_CUR_DIRECTION); 757 1.1 joerg } 758 1.1 joerg 759 1.1 joerg static __inline__ __m256 __DEFAULT_FN_ATTRS512 760 1.1 joerg _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { 761 1.1 joerg return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, 762 1.1 joerg (__v8sf) _mm256_setzero_ps(), 763 1.1 joerg (__mmask8) __U, 764 1.1 joerg _MM_FROUND_CUR_DIRECTION); 765 1.1 joerg } 766 1.1 joerg 767 1.1 joerg #define _mm512_cvt_roundepu64_ps(A, R) \ 768 1.1 joerg (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ 769 1.1 joerg (__v8sf)_mm256_setzero_ps(), \ 770 1.1 joerg (__mmask8)-1, (int)(R)) 771 1.1 joerg 772 1.1 joerg #define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \ 773 1.1 joerg (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ 774 1.1 joerg (__v8sf)(__m256)(W), (__mmask8)(U), \ 775 1.1 joerg (int)(R)) 776 1.1 joerg 777 1.1 joerg #define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \ 778 1.1 joerg (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ 779 1.1 joerg (__v8sf)_mm256_setzero_ps(), \ 780 1.1 joerg (__mmask8)(U), (int)(R)) 781 1.1 joerg 782 1.1 joerg #define _mm512_range_pd(A, B, C) \ 783 1.1 joerg (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ 784 1.1 joerg (__v8df)(__m512d)(B), (int)(C), \ 785 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 786 1.1 joerg (__mmask8)-1, \ 787 1.1 joerg _MM_FROUND_CUR_DIRECTION) 788 1.1 joerg 789 1.1 joerg #define _mm512_mask_range_pd(W, U, A, B, C) \ 790 1.1 joerg (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ 791 1.1 joerg (__v8df)(__m512d)(B), (int)(C), \ 792 1.1 joerg (__v8df)(__m512d)(W), (__mmask8)(U), \ 793 1.1 joerg _MM_FROUND_CUR_DIRECTION) 794 1.1 joerg 795 1.1 joerg #define _mm512_maskz_range_pd(U, A, B, C) \ 796 1.1 joerg (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ 797 1.1 joerg (__v8df)(__m512d)(B), (int)(C), \ 798 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 799 1.1 joerg (__mmask8)(U), \ 800 1.1 joerg _MM_FROUND_CUR_DIRECTION) 801 1.1 joerg 802 1.1 joerg #define _mm512_range_round_pd(A, B, C, R) \ 803 1.1 joerg (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ 804 1.1 joerg (__v8df)(__m512d)(B), (int)(C), \ 805 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 806 1.1 joerg (__mmask8)-1, (int)(R)) 807 1.1 joerg 808 1.1 joerg #define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ 809 1.1 joerg (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ 810 1.1 joerg (__v8df)(__m512d)(B), (int)(C), \ 811 1.1 joerg (__v8df)(__m512d)(W), (__mmask8)(U), \ 812 1.1 joerg (int)(R)) 813 1.1 joerg 814 1.1 joerg #define _mm512_maskz_range_round_pd(U, A, B, C, R) \ 815 1.1 joerg (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ 816 1.1 joerg (__v8df)(__m512d)(B), (int)(C), \ 817 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 818 1.1 joerg (__mmask8)(U), (int)(R)) 819 1.1 joerg 820 1.1 joerg #define _mm512_range_ps(A, B, C) \ 821 1.1 joerg (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ 822 1.1 joerg (__v16sf)(__m512)(B), (int)(C), \ 823 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 824 1.1 joerg (__mmask16)-1, \ 825 1.1 joerg _MM_FROUND_CUR_DIRECTION) 826 1.1 joerg 827 1.1 joerg #define _mm512_mask_range_ps(W, U, A, B, C) \ 828 1.1 joerg (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ 829 1.1 joerg (__v16sf)(__m512)(B), (int)(C), \ 830 1.1 joerg (__v16sf)(__m512)(W), (__mmask16)(U), \ 831 1.1 joerg _MM_FROUND_CUR_DIRECTION) 832 1.1 joerg 833 1.1 joerg #define _mm512_maskz_range_ps(U, A, B, C) \ 834 1.1 joerg (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ 835 1.1 joerg (__v16sf)(__m512)(B), (int)(C), \ 836 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 837 1.1 joerg (__mmask16)(U), \ 838 1.1 joerg _MM_FROUND_CUR_DIRECTION) 839 1.1 joerg 840 1.1 joerg #define _mm512_range_round_ps(A, B, C, R) \ 841 1.1 joerg (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ 842 1.1 joerg (__v16sf)(__m512)(B), (int)(C), \ 843 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 844 1.1 joerg (__mmask16)-1, (int)(R)) 845 1.1 joerg 846 1.1 joerg #define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ 847 1.1 joerg (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ 848 1.1 joerg (__v16sf)(__m512)(B), (int)(C), \ 849 1.1 joerg (__v16sf)(__m512)(W), (__mmask16)(U), \ 850 1.1 joerg (int)(R)) 851 1.1 joerg 852 1.1 joerg #define _mm512_maskz_range_round_ps(U, A, B, C, R) \ 853 1.1 joerg (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ 854 1.1 joerg (__v16sf)(__m512)(B), (int)(C), \ 855 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 856 1.1 joerg (__mmask16)(U), (int)(R)) 857 1.1 joerg 858 1.1 joerg #define _mm_range_round_ss(A, B, C, R) \ 859 1.1 joerg (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ 860 1.1 joerg (__v4sf)(__m128)(B), \ 861 1.1 joerg (__v4sf)_mm_setzero_ps(), \ 862 1.1 joerg (__mmask8) -1, (int)(C),\ 863 1.1 joerg (int)(R)) 864 1.1 joerg 865 1.1 joerg #define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION) 866 1.1 joerg 867 1.1 joerg #define _mm_mask_range_round_ss(W, U, A, B, C, R) \ 868 1.1 joerg (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ 869 1.1 joerg (__v4sf)(__m128)(B), \ 870 1.1 joerg (__v4sf)(__m128)(W),\ 871 1.1 joerg (__mmask8)(U), (int)(C),\ 872 1.1 joerg (int)(R)) 873 1.1 joerg 874 1.1 joerg #define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION) 875 1.1 joerg 876 1.1 joerg #define _mm_maskz_range_round_ss(U, A, B, C, R) \ 877 1.1 joerg (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ 878 1.1 joerg (__v4sf)(__m128)(B), \ 879 1.1 joerg (__v4sf)_mm_setzero_ps(), \ 880 1.1 joerg (__mmask8)(U), (int)(C),\ 881 1.1 joerg (int)(R)) 882 1.1 joerg 883 1.1 joerg #define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) 884 1.1 joerg 885 1.1 joerg #define _mm_range_round_sd(A, B, C, R) \ 886 1.1 joerg (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ 887 1.1 joerg (__v2df)(__m128d)(B), \ 888 1.1 joerg (__v2df)_mm_setzero_pd(), \ 889 1.1 joerg (__mmask8) -1, (int)(C),\ 890 1.1 joerg (int)(R)) 891 1.1 joerg 892 1.1 joerg #define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION) 893 1.1 joerg 894 1.1 joerg #define _mm_mask_range_round_sd(W, U, A, B, C, R) \ 895 1.1 joerg (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ 896 1.1 joerg (__v2df)(__m128d)(B), \ 897 1.1 joerg (__v2df)(__m128d)(W),\ 898 1.1 joerg (__mmask8)(U), (int)(C),\ 899 1.1 joerg (int)(R)) 900 1.1 joerg 901 1.1 joerg #define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION) 902 1.1 joerg 903 1.1 joerg #define _mm_maskz_range_round_sd(U, A, B, C, R) \ 904 1.1 joerg (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ 905 1.1 joerg (__v2df)(__m128d)(B), \ 906 1.1 joerg (__v2df)_mm_setzero_pd(), \ 907 1.1 joerg (__mmask8)(U), (int)(C),\ 908 1.1 joerg (int)(R)) 909 1.1 joerg 910 1.1 joerg #define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) 911 1.1 joerg 912 1.1 joerg #define _mm512_reduce_pd(A, B) \ 913 1.1 joerg (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ 914 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 915 1.1 joerg (__mmask8)-1, \ 916 1.1 joerg _MM_FROUND_CUR_DIRECTION) 917 1.1 joerg 918 1.1 joerg #define _mm512_mask_reduce_pd(W, U, A, B) \ 919 1.1 joerg (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ 920 1.1 joerg (__v8df)(__m512d)(W), \ 921 1.1 joerg (__mmask8)(U), \ 922 1.1 joerg _MM_FROUND_CUR_DIRECTION) 923 1.1 joerg 924 1.1 joerg #define _mm512_maskz_reduce_pd(U, A, B) \ 925 1.1 joerg (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ 926 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 927 1.1 joerg (__mmask8)(U), \ 928 1.1 joerg _MM_FROUND_CUR_DIRECTION) 929 1.1 joerg 930 1.1 joerg #define _mm512_reduce_ps(A, B) \ 931 1.1 joerg (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ 932 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 933 1.1 joerg (__mmask16)-1, \ 934 1.1 joerg _MM_FROUND_CUR_DIRECTION) 935 1.1 joerg 936 1.1 joerg #define _mm512_mask_reduce_ps(W, U, A, B) \ 937 1.1 joerg (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ 938 1.1 joerg (__v16sf)(__m512)(W), \ 939 1.1 joerg (__mmask16)(U), \ 940 1.1 joerg _MM_FROUND_CUR_DIRECTION) 941 1.1 joerg 942 1.1 joerg #define _mm512_maskz_reduce_ps(U, A, B) \ 943 1.1 joerg (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ 944 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 945 1.1 joerg (__mmask16)(U), \ 946 1.1 joerg _MM_FROUND_CUR_DIRECTION) 947 1.1 joerg 948 1.1 joerg #define _mm512_reduce_round_pd(A, B, R) \ 949 1.1 joerg (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ 950 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 951 1.1 joerg (__mmask8)-1, (int)(R)) 952 1.1 joerg 953 1.1 joerg #define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ 954 1.1 joerg (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ 955 1.1 joerg (__v8df)(__m512d)(W), \ 956 1.1 joerg (__mmask8)(U), (int)(R)) 957 1.1 joerg 958 1.1 joerg #define _mm512_maskz_reduce_round_pd(U, A, B, R) \ 959 1.1 joerg (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ 960 1.1 joerg (__v8df)_mm512_setzero_pd(), \ 961 1.1 joerg (__mmask8)(U), (int)(R)) 962 1.1 joerg 963 1.1 joerg #define _mm512_reduce_round_ps(A, B, R) \ 964 1.1 joerg (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ 965 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 966 1.1 joerg (__mmask16)-1, (int)(R)) 967 1.1 joerg 968 1.1 joerg #define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ 969 1.1 joerg (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ 970 1.1 joerg (__v16sf)(__m512)(W), \ 971 1.1 joerg (__mmask16)(U), (int)(R)) 972 1.1 joerg 973 1.1 joerg #define _mm512_maskz_reduce_round_ps(U, A, B, R) \ 974 1.1 joerg (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ 975 1.1 joerg (__v16sf)_mm512_setzero_ps(), \ 976 1.1 joerg (__mmask16)(U), (int)(R)) 977 1.1 joerg 978 1.1 joerg #define _mm_reduce_ss(A, B, C) \ 979 1.1 joerg (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ 980 1.1 joerg (__v4sf)(__m128)(B), \ 981 1.1 joerg (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ 982 1.1 joerg (int)(C), _MM_FROUND_CUR_DIRECTION) 983 1.1 joerg 984 1.1 joerg #define _mm_mask_reduce_ss(W, U, A, B, C) \ 985 1.1 joerg (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ 986 1.1 joerg (__v4sf)(__m128)(B), \ 987 1.1 joerg (__v4sf)(__m128)(W), (__mmask8)(U), \ 988 1.1 joerg (int)(C), _MM_FROUND_CUR_DIRECTION) 989 1.1 joerg 990 1.1 joerg #define _mm_maskz_reduce_ss(U, A, B, C) \ 991 1.1 joerg (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ 992 1.1 joerg (__v4sf)(__m128)(B), \ 993 1.1 joerg (__v4sf)_mm_setzero_ps(), \ 994 1.1 joerg (__mmask8)(U), (int)(C), \ 995 1.1 joerg _MM_FROUND_CUR_DIRECTION) 996 1.1 joerg 997 1.1 joerg #define _mm_reduce_round_ss(A, B, C, R) \ 998 1.1 joerg (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ 999 1.1 joerg (__v4sf)(__m128)(B), \ 1000 1.1 joerg (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ 1001 1.1 joerg (int)(C), (int)(R)) 1002 1.1 joerg 1003 1.1 joerg #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ 1004 1.1 joerg (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ 1005 1.1 joerg (__v4sf)(__m128)(B), \ 1006 1.1 joerg (__v4sf)(__m128)(W), (__mmask8)(U), \ 1007 1.1 joerg (int)(C), (int)(R)) 1008 1.1 joerg 1009 1.1 joerg #define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ 1010 1.1 joerg (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ 1011 1.1 joerg (__v4sf)(__m128)(B), \ 1012 1.1 joerg (__v4sf)_mm_setzero_ps(), \ 1013 1.1 joerg (__mmask8)(U), (int)(C), (int)(R)) 1014 1.1 joerg 1015 1.1 joerg #define _mm_reduce_sd(A, B, C) \ 1016 1.1 joerg (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ 1017 1.1 joerg (__v2df)(__m128d)(B), \ 1018 1.1 joerg (__v2df)_mm_setzero_pd(), \ 1019 1.1 joerg (__mmask8)-1, (int)(C), \ 1020 1.1 joerg _MM_FROUND_CUR_DIRECTION) 1021 1.1 joerg 1022 1.1 joerg #define _mm_mask_reduce_sd(W, U, A, B, C) \ 1023 1.1 joerg (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ 1024 1.1 joerg (__v2df)(__m128d)(B), \ 1025 1.1 joerg (__v2df)(__m128d)(W), (__mmask8)(U), \ 1026 1.1 joerg (int)(C), _MM_FROUND_CUR_DIRECTION) 1027 1.1 joerg 1028 1.1 joerg #define _mm_maskz_reduce_sd(U, A, B, C) \ 1029 1.1 joerg (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ 1030 1.1 joerg (__v2df)(__m128d)(B), \ 1031 1.1 joerg (__v2df)_mm_setzero_pd(), \ 1032 1.1 joerg (__mmask8)(U), (int)(C), \ 1033 1.1 joerg _MM_FROUND_CUR_DIRECTION) 1034 1.1 joerg 1035 1.1 joerg #define _mm_reduce_round_sd(A, B, C, R) \ 1036 1.1 joerg (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ 1037 1.1 joerg (__v2df)(__m128d)(B), \ 1038 1.1 joerg (__v2df)_mm_setzero_pd(), \ 1039 1.1 joerg (__mmask8)-1, (int)(C), (int)(R)) 1040 1.1 joerg 1041 1.1 joerg #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ 1042 1.1 joerg (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ 1043 1.1 joerg (__v2df)(__m128d)(B), \ 1044 1.1 joerg (__v2df)(__m128d)(W), (__mmask8)(U), \ 1045 1.1 joerg (int)(C), (int)(R)) 1046 1.1 joerg 1047 1.1 joerg #define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ 1048 1.1 joerg (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ 1049 1.1 joerg (__v2df)(__m128d)(B), \ 1050 1.1 joerg (__v2df)_mm_setzero_pd(), \ 1051 1.1 joerg (__mmask8)(U), (int)(C), (int)(R)) 1052 1.1 joerg 1053 1.1 joerg static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 1054 1.1 joerg _mm512_movepi32_mask (__m512i __A) 1055 1.1 joerg { 1056 1.1 joerg return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A); 1057 1.1 joerg } 1058 1.1 joerg 1059 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1060 1.1 joerg _mm512_movm_epi32 (__mmask16 __A) 1061 1.1 joerg { 1062 1.1 joerg return (__m512i) __builtin_ia32_cvtmask2d512 (__A); 1063 1.1 joerg } 1064 1.1 joerg 1065 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1066 1.1 joerg _mm512_movm_epi64 (__mmask8 __A) 1067 1.1 joerg { 1068 1.1 joerg return (__m512i) __builtin_ia32_cvtmask2q512 (__A); 1069 1.1 joerg } 1070 1.1 joerg 1071 1.1 joerg static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 1072 1.1 joerg _mm512_movepi64_mask (__m512i __A) 1073 1.1 joerg { 1074 1.1 joerg return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); 1075 1.1 joerg } 1076 1.1 joerg 1077 1.1 joerg 1078 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 1079 1.1 joerg _mm512_broadcast_f32x2 (__m128 __A) 1080 1.1 joerg { 1081 1.1 joerg return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 1082 1.1 joerg 0, 1, 0, 1, 0, 1, 0, 1, 1083 1.1 joerg 0, 1, 0, 1, 0, 1, 0, 1); 1084 1.1 joerg } 1085 1.1 joerg 1086 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 1087 1.1 joerg _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) 1088 1.1 joerg { 1089 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 1090 1.1 joerg (__v16sf)_mm512_broadcast_f32x2(__A), 1091 1.1 joerg (__v16sf)__O); 1092 1.1 joerg } 1093 1.1 joerg 1094 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 1095 1.1 joerg _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) 1096 1.1 joerg { 1097 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 1098 1.1 joerg (__v16sf)_mm512_broadcast_f32x2(__A), 1099 1.1 joerg (__v16sf)_mm512_setzero_ps()); 1100 1.1 joerg } 1101 1.1 joerg 1102 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 1103 1.1 joerg _mm512_broadcast_f32x8(__m256 __A) 1104 1.1 joerg { 1105 1.1 joerg return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A, 1106 1.1 joerg 0, 1, 2, 3, 4, 5, 6, 7, 1107 1.1 joerg 0, 1, 2, 3, 4, 5, 6, 7); 1108 1.1 joerg } 1109 1.1 joerg 1110 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 1111 1.1 joerg _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) 1112 1.1 joerg { 1113 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 1114 1.1 joerg (__v16sf)_mm512_broadcast_f32x8(__A), 1115 1.1 joerg (__v16sf)__O); 1116 1.1 joerg } 1117 1.1 joerg 1118 1.1 joerg static __inline__ __m512 __DEFAULT_FN_ATTRS512 1119 1.1 joerg _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) 1120 1.1 joerg { 1121 1.1 joerg return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, 1122 1.1 joerg (__v16sf)_mm512_broadcast_f32x8(__A), 1123 1.1 joerg (__v16sf)_mm512_setzero_ps()); 1124 1.1 joerg } 1125 1.1 joerg 1126 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 1127 1.1 joerg _mm512_broadcast_f64x2(__m128d __A) 1128 1.1 joerg { 1129 1.1 joerg return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, 1130 1.1 joerg 0, 1, 0, 1, 0, 1, 0, 1); 1131 1.1 joerg } 1132 1.1 joerg 1133 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 1134 1.1 joerg _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) 1135 1.1 joerg { 1136 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, 1137 1.1 joerg (__v8df)_mm512_broadcast_f64x2(__A), 1138 1.1 joerg (__v8df)__O); 1139 1.1 joerg } 1140 1.1 joerg 1141 1.1 joerg static __inline__ __m512d __DEFAULT_FN_ATTRS512 1142 1.1 joerg _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) 1143 1.1 joerg { 1144 1.1 joerg return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, 1145 1.1 joerg (__v8df)_mm512_broadcast_f64x2(__A), 1146 1.1 joerg (__v8df)_mm512_setzero_pd()); 1147 1.1 joerg } 1148 1.1 joerg 1149 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1150 1.1 joerg _mm512_broadcast_i32x2 (__m128i __A) 1151 1.1 joerg { 1152 1.1 joerg return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 1153 1.1 joerg 0, 1, 0, 1, 0, 1, 0, 1, 1154 1.1 joerg 0, 1, 0, 1, 0, 1, 0, 1); 1155 1.1 joerg } 1156 1.1 joerg 1157 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1158 1.1 joerg _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) 1159 1.1 joerg { 1160 1.1 joerg return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1161 1.1 joerg (__v16si)_mm512_broadcast_i32x2(__A), 1162 1.1 joerg (__v16si)__O); 1163 1.1 joerg } 1164 1.1 joerg 1165 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1166 1.1 joerg _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) 1167 1.1 joerg { 1168 1.1 joerg return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1169 1.1 joerg (__v16si)_mm512_broadcast_i32x2(__A), 1170 1.1 joerg (__v16si)_mm512_setzero_si512()); 1171 1.1 joerg } 1172 1.1 joerg 1173 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1174 1.1 joerg _mm512_broadcast_i32x8(__m256i __A) 1175 1.1 joerg { 1176 1.1 joerg return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A, 1177 1.1 joerg 0, 1, 2, 3, 4, 5, 6, 7, 1178 1.1 joerg 0, 1, 2, 3, 4, 5, 6, 7); 1179 1.1 joerg } 1180 1.1 joerg 1181 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1182 1.1 joerg _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) 1183 1.1 joerg { 1184 1.1 joerg return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1185 1.1 joerg (__v16si)_mm512_broadcast_i32x8(__A), 1186 1.1 joerg (__v16si)__O); 1187 1.1 joerg } 1188 1.1 joerg 1189 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1190 1.1 joerg _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) 1191 1.1 joerg { 1192 1.1 joerg return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, 1193 1.1 joerg (__v16si)_mm512_broadcast_i32x8(__A), 1194 1.1 joerg (__v16si)_mm512_setzero_si512()); 1195 1.1 joerg } 1196 1.1 joerg 1197 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1198 1.1 joerg _mm512_broadcast_i64x2(__m128i __A) 1199 1.1 joerg { 1200 1.1 joerg return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, 1201 1.1 joerg 0, 1, 0, 1, 0, 1, 0, 1); 1202 1.1 joerg } 1203 1.1 joerg 1204 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1205 1.1 joerg _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) 1206 1.1 joerg { 1207 1.1 joerg return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1208 1.1 joerg (__v8di)_mm512_broadcast_i64x2(__A), 1209 1.1 joerg (__v8di)__O); 1210 1.1 joerg } 1211 1.1 joerg 1212 1.1 joerg static __inline__ __m512i __DEFAULT_FN_ATTRS512 1213 1.1 joerg _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) 1214 1.1 joerg { 1215 1.1 joerg return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, 1216 1.1 joerg (__v8di)_mm512_broadcast_i64x2(__A), 1217 1.1 joerg (__v8di)_mm512_setzero_si512()); 1218 1.1 joerg } 1219 1.1 joerg 1220 1.1 joerg #define _mm512_extractf32x8_ps(A, imm) \ 1221 1.1 joerg (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ 1222 1.1 joerg (__v8sf)_mm256_undefined_ps(), \ 1223 1.1 joerg (__mmask8)-1) 1224 1.1 joerg 1225 1.1 joerg #define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ 1226 1.1 joerg (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ 1227 1.1 joerg (__v8sf)(__m256)(W), \ 1228 1.1 joerg (__mmask8)(U)) 1229 1.1 joerg 1230 1.1 joerg #define _mm512_maskz_extractf32x8_ps(U, A, imm) \ 1231 1.1 joerg (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ 1232 1.1 joerg (__v8sf)_mm256_setzero_ps(), \ 1233 1.1 joerg (__mmask8)(U)) 1234 1.1 joerg 1235 1.1 joerg #define _mm512_extractf64x2_pd(A, imm) \ 1236 1.1 joerg (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ 1237 1.1 joerg (int)(imm), \ 1238 1.1 joerg (__v2df)_mm_undefined_pd(), \ 1239 1.1 joerg (__mmask8)-1) 1240 1.1 joerg 1241 1.1 joerg #define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ 1242 1.1 joerg (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ 1243 1.1 joerg (int)(imm), \ 1244 1.1 joerg (__v2df)(__m128d)(W), \ 1245 1.1 joerg (__mmask8)(U)) 1246 1.1 joerg 1247 1.1 joerg #define _mm512_maskz_extractf64x2_pd(U, A, imm) \ 1248 1.1 joerg (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ 1249 1.1 joerg (int)(imm), \ 1250 1.1 joerg (__v2df)_mm_setzero_pd(), \ 1251 1.1 joerg (__mmask8)(U)) 1252 1.1 joerg 1253 1.1 joerg #define _mm512_extracti32x8_epi32(A, imm) \ 1254 1.1 joerg (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ 1255 1.1 joerg (__v8si)_mm256_undefined_si256(), \ 1256 1.1 joerg (__mmask8)-1) 1257 1.1 joerg 1258 1.1 joerg #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ 1259 1.1 joerg (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ 1260 1.1 joerg (__v8si)(__m256i)(W), \ 1261 1.1 joerg (__mmask8)(U)) 1262 1.1 joerg 1263 1.1 joerg #define _mm512_maskz_extracti32x8_epi32(U, A, imm) \ 1264 1.1 joerg (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ 1265 1.1 joerg (__v8si)_mm256_setzero_si256(), \ 1266 1.1 joerg (__mmask8)(U)) 1267 1.1 joerg 1268 1.1 joerg #define _mm512_extracti64x2_epi64(A, imm) \ 1269 1.1 joerg (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ 1270 1.1 joerg (int)(imm), \ 1271 1.1 joerg (__v2di)_mm_undefined_si128(), \ 1272 1.1 joerg (__mmask8)-1) 1273 1.1 joerg 1274 1.1 joerg #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ 1275 1.1 joerg (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ 1276 1.1 joerg (int)(imm), \ 1277 1.1 joerg (__v2di)(__m128i)(W), \ 1278 1.1 joerg (__mmask8)(U)) 1279 1.1 joerg 1280 1.1 joerg #define _mm512_maskz_extracti64x2_epi64(U, A, imm) \ 1281 1.1 joerg (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ 1282 1.1 joerg (int)(imm), \ 1283 1.1 joerg (__v2di)_mm_setzero_si128(), \ 1284 1.1 joerg (__mmask8)(U)) 1285 1.1 joerg 1286 1.1 joerg #define _mm512_insertf32x8(A, B, imm) \ 1287 1.1 joerg (__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ 1288 1.1 joerg (__v8sf)(__m256)(B), (int)(imm)) 1289 1.1 joerg 1290 1.1 joerg #define _mm512_mask_insertf32x8(W, U, A, B, imm) \ 1291 1.1 joerg (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1292 1.1 joerg (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ 1293 1.1 joerg (__v16sf)(__m512)(W)) 1294 1.1 joerg 1295 1.1 joerg #define _mm512_maskz_insertf32x8(U, A, B, imm) \ 1296 1.1 joerg (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 1297 1.1 joerg (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ 1298 1.1 joerg (__v16sf)_mm512_setzero_ps()) 1299 1.1 joerg 1300 1.1 joerg #define _mm512_insertf64x2(A, B, imm) \ 1301 1.1 joerg (__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ 1302 1.1 joerg (__v2df)(__m128d)(B), (int)(imm)) 1303 1.1 joerg 1304 1.1 joerg #define _mm512_mask_insertf64x2(W, U, A, B, imm) \ 1305 1.1 joerg (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1306 1.1 joerg (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ 1307 1.1 joerg (__v8df)(__m512d)(W)) 1308 1.1 joerg 1309 1.1 joerg #define _mm512_maskz_insertf64x2(U, A, B, imm) \ 1310 1.1 joerg (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ 1311 1.1 joerg (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ 1312 1.1 joerg (__v8df)_mm512_setzero_pd()) 1313 1.1 joerg 1314 1.1 joerg #define _mm512_inserti32x8(A, B, imm) \ 1315 1.1 joerg (__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ 1316 1.1 joerg (__v8si)(__m256i)(B), (int)(imm)) 1317 1.1 joerg 1318 1.1 joerg #define _mm512_mask_inserti32x8(W, U, A, B, imm) \ 1319 1.1 joerg (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 1320 1.1 joerg (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ 1321 1.1 joerg (__v16si)(__m512i)(W)) 1322 1.1 joerg 1323 1.1 joerg #define _mm512_maskz_inserti32x8(U, A, B, imm) \ 1324 1.1 joerg (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ 1325 1.1 joerg (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ 1326 1.1 joerg (__v16si)_mm512_setzero_si512()) 1327 1.1 joerg 1328 1.1 joerg #define _mm512_inserti64x2(A, B, imm) \ 1329 1.1 joerg (__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ 1330 1.1 joerg (__v2di)(__m128i)(B), (int)(imm)) 1331 1.1 joerg 1332 1.1 joerg #define _mm512_mask_inserti64x2(W, U, A, B, imm) \ 1333 1.1 joerg (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 1334 1.1 joerg (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ 1335 1.1 joerg (__v8di)(__m512i)(W)) 1336 1.1 joerg 1337 1.1 joerg #define _mm512_maskz_inserti64x2(U, A, B, imm) \ 1338 1.1 joerg (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ 1339 1.1 joerg (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ 1340 1.1 joerg (__v8di)_mm512_setzero_si512()) 1341 1.1 joerg 1342 1.1 joerg #define _mm512_mask_fpclass_ps_mask(U, A, imm) \ 1343 1.1 joerg (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ 1344 1.1 joerg (int)(imm), (__mmask16)(U)) 1345 1.1 joerg 1346 1.1 joerg #define _mm512_fpclass_ps_mask(A, imm) \ 1347 1.1 joerg (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ 1348 1.1 joerg (int)(imm), (__mmask16)-1) 1349 1.1 joerg 1350 1.1 joerg #define _mm512_mask_fpclass_pd_mask(U, A, imm) \ 1351 1.1 joerg (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ 1352 1.1 joerg (__mmask8)(U)) 1353 1.1 joerg 1354 1.1 joerg #define _mm512_fpclass_pd_mask(A, imm) \ 1355 1.1 joerg (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ 1356 1.1 joerg (__mmask8)-1) 1357 1.1 joerg 1358 1.1 joerg #define _mm_fpclass_sd_mask(A, imm) \ 1359 1.1 joerg (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ 1360 1.1 joerg (__mmask8)-1) 1361 1.1 joerg 1362 1.1 joerg #define _mm_mask_fpclass_sd_mask(U, A, imm) \ 1363 1.1 joerg (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ 1364 1.1 joerg (__mmask8)(U)) 1365 1.1 joerg 1366 1.1 joerg #define _mm_fpclass_ss_mask(A, imm) \ 1367 1.1 joerg (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ 1368 1.1 joerg (__mmask8)-1) 1369 1.1 joerg 1370 1.1 joerg #define _mm_mask_fpclass_ss_mask(U, A, imm) \ 1371 1.1 joerg (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ 1372 1.1 joerg (__mmask8)(U)) 1373 1.1 joerg 1374 1.1 joerg #undef __DEFAULT_FN_ATTRS512 1375 1.1 joerg #undef __DEFAULT_FN_ATTRS 1376 1.1 joerg 1377 1.1 joerg #endif 1378