// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
52 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) 53 # include <immintrin.h> 54 // Only include <intrin.h> when it is needed. GCC and Clang can both 55 // use __builtin's, so we only need Windows instrincs when using MSVC. 56 // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these 57 // cases explicitly. 58 #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__) 59 # include <intrin.h> 60 #endif 61 62 63 /////////////////// 64 // Byte swapping // 65 /////////////////// 66 67 #if defined(HAVE___BUILTIN_BSWAPXX) 68 // GCC >= 4.8 and Clang 69 # define byteswap16(num) __builtin_bswap16(num) 70 # define byteswap32(num) __builtin_bswap32(num) 71 # define byteswap64(num) __builtin_bswap64(num) 72 73 #elif defined(HAVE_BYTESWAP_H) 74 // glibc, uClibc, dietlibc 75 # include <byteswap.h> 76 # ifdef HAVE_BSWAP_16 77 # define byteswap16(num) bswap_16(num) 78 # endif 79 # ifdef HAVE_BSWAP_32 80 # define byteswap32(num) bswap_32(num) 81 # endif 82 # ifdef HAVE_BSWAP_64 83 # define byteswap64(num) bswap_64(num) 84 # endif 85 86 #elif defined(HAVE_SYS_ENDIAN_H) 87 // *BSDs and Darwin 88 # include <sys/endian.h> 89 # ifdef __OpenBSD__ 90 # define byteswap16(num) swap16(num) 91 # define byteswap32(num) swap32(num) 92 # define byteswap64(num) swap64(num) 93 # else 94 # define byteswap16(num) bswap16(num) 95 # define byteswap32(num) bswap32(num) 96 # define byteswap64(num) bswap64(num) 97 # endif 98 99 #elif defined(HAVE_SYS_BYTEORDER_H) 100 // Solaris 101 # include <sys/byteorder.h> 102 # ifdef BSWAP_16 103 # define byteswap16(num) BSWAP_16(num) 104 # endif 105 # ifdef BSWAP_32 106 # define byteswap32(num) BSWAP_32(num) 107 # endif 108 # ifdef BSWAP_64 109 # define byteswap64(num) BSWAP_64(num) 110 # endif 111 # ifdef BE_16 112 # define conv16be(num) BE_16(num) 113 # endif 114 # ifdef BE_32 115 # define conv32be(num) BE_32(num) 116 # endif 117 # ifdef BE_64 118 # define conv64be(num) BE_64(num) 119 # endif 120 # ifdef LE_16 121 # define 
conv16le(num) LE_16(num) 122 # endif 123 # ifdef LE_32 124 # define conv32le(num) LE_32(num) 125 # endif 126 # ifdef LE_64 127 # define conv64le(num) LE_64(num) 128 # endif 129 #endif 130 131 #ifndef byteswap16 132 # define byteswap16(n) (uint16_t)( \ 133 (((n) & 0x00FFU) << 8) \ 134 | (((n) & 0xFF00U) >> 8) \ 135 ) 136 #endif 137 138 #ifndef byteswap32 139 # define byteswap32(n) (uint32_t)( \ 140 (((n) & UINT32_C(0x000000FF)) << 24) \ 141 | (((n) & UINT32_C(0x0000FF00)) << 8) \ 142 | (((n) & UINT32_C(0x00FF0000)) >> 8) \ 143 | (((n) & UINT32_C(0xFF000000)) >> 24) \ 144 ) 145 #endif 146 147 #ifndef byteswap64 148 # define byteswap64(n) (uint64_t)( \ 149 (((n) & UINT64_C(0x00000000000000FF)) << 56) \ 150 | (((n) & UINT64_C(0x000000000000FF00)) << 40) \ 151 | (((n) & UINT64_C(0x0000000000FF0000)) << 24) \ 152 | (((n) & UINT64_C(0x00000000FF000000)) << 8) \ 153 | (((n) & UINT64_C(0x000000FF00000000)) >> 8) \ 154 | (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \ 155 | (((n) & UINT64_C(0x00FF000000000000)) >> 40) \ 156 | (((n) & UINT64_C(0xFF00000000000000)) >> 56) \ 157 ) 158 #endif 159 160 // Define conversion macros using the basic byte swapping macros. 
// Each convXXYe() is defined only if it wasn't already provided earlier
// in this file. Converting to/from the native byte order is a plain cast
// to the fixed-width type; the other byte order goes through byteswapXX().
#if defined(WORDS_BIGENDIAN)
	// Big endian target: "be" is native, "le" swaps.
# if !defined(conv16be)
#  define conv16be(n) ((uint16_t)(n))
# endif
# if !defined(conv32be)
#  define conv32be(n) ((uint32_t)(n))
# endif
# if !defined(conv64be)
#  define conv64be(n) ((uint64_t)(n))
# endif
# if !defined(conv16le)
#  define conv16le(n) byteswap16(n)
# endif
# if !defined(conv32le)
#  define conv32le(n) byteswap32(n)
# endif
# if !defined(conv64le)
#  define conv64le(n) byteswap64(n)
# endif
#else
	// Little endian target: "le" is native, "be" swaps.
# if !defined(conv16be)
#  define conv16be(n) byteswap16(n)
# endif
# if !defined(conv32be)
#  define conv32be(n) byteswap32(n)
# endif
# if !defined(conv64be)
#  define conv64be(n) byteswap64(n)
# endif
# if !defined(conv16le)
#  define conv16le(n) ((uint16_t)(n))
# endif
# if !defined(conv32le)
#  define conv32le(n) ((uint32_t)(n))
# endif
# if !defined(conv64le)
#  define conv64le(n) ((uint64_t)(n))
# endif
#endif


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// identical code as with the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example, Clang 9
// on x86-64 can detect 32-bit load but not 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true on at least ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all so this is relevant only in the case when
// GCC assumes that unaligned is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
    memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
    b[2] = (uint8_t)(v >> 16);
    b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
    memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

// Native endian accessors. The bytes are moved with memcpy() unless
// TUKLIB_USE_UNSAFE_TYPE_PUNNING selects the pointer-cast method.

/// Reads an unaligned 16-bit native endian integer from buf.
static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	uint16_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	return *(const uint16_t *)buf;
#endif
}


/// Reads an unaligned 32-bit native endian integer from buf.
static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	uint32_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	return *(const uint32_t *)buf;
#endif
}


/// Reads an unaligned 64-bit native endian integer from buf.
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	uint64_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	return *(const uint64_t *)buf;
#endif
}


/// Writes num to buf as an unaligned 16-bit native endian integer.
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	memcpy(buf, &num, sizeof(num));
#else
	*(uint16_t *)buf = num;
#endif
}


/// Writes num to buf as an unaligned 32-bit native endian integer.
static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	memcpy(buf, &num, sizeof(num));
#else
	*(uint32_t *)buf = num;
#endif
}


/// Writes num to buf as an unaligned 64-bit native endian integer.
static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	memcpy(buf, &num, sizeof(num));
#else
	*(uint64_t *)buf = num;
#endif
}


// Fixed endian reads: a native endian read followed by a conversion.
// The value is stored in a local first because convXXYe() can be a macro
// that evaluates its argument more than once.

static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t raw = read16ne(buf);
	return conv16be(raw);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t raw = read16ne(buf);
	return conv16le(raw);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t raw = read32ne(buf);
	return conv32be(raw);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t raw = read32ne(buf);
	return conv32le(raw);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t raw = read64ne(buf);
	return conv64be(raw);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t raw = read64ne(buf);
	return conv64le(raw);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne((buf), conv16be(num))
#define write16le(buf, num) write16ne((buf), conv16le(num))
#define write32be(buf, num) write32ne((buf), conv32be(num))
#define write32le(buf, num) write32ne((buf), conv32le(num))
#define write64be(buf, num) write64ne((buf), conv64be(num))
#define write64le(buf, num) write64ne((buf), conv64le(num))

#else

// Without fast unaligned access everything is done byte by byte. The
// native endian names are aliases for the matching fixed endian functions.
#ifdef WORDS_BIGENDIAN
# define read16ne read16be
# define read32ne read32be
# define read64ne read64be
# define write16ne write16be
# define write32ne write32be
# define write64ne write64be
#else
# define read16ne read16le
# define read32ne read32le
# define read64ne read64le
# define write16ne write16le
# define write32ne write32le
# define write64ne write64le
#endif


/// Reads a 16-bit big endian integer; buf[0] is the most significant byte.
static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t value = (uint16_t)buf[1] | ((uint16_t)buf[0] << 8);
	return value;
}


/// Reads a 16-bit little endian integer; buf[0] is the least significant byte.
static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t value = ((uint16_t)buf[1] << 8) | ((uint16_t)buf[0]);
	return value;
}


/// Reads a 32-bit big endian integer; buf[0] is the most significant byte.
static inline uint32_t
read32be(const uint8_t *buf)
{
	return ((uint32_t)buf[0] << 24)
			| ((uint32_t)buf[1] << 16)
			| ((uint32_t)buf[2] << 8)
			| (uint32_t)buf[3];
}


/// Reads a 32-bit little endian integer; buf[0] is the least significant byte.
static inline uint32_t
read32le(const uint8_t *buf)
{
	return (uint32_t)buf[0]
			| ((uint32_t)buf[1] << 8)
			| ((uint32_t)buf[2] << 16)
			| ((uint32_t)buf[3] << 24);
}


/// Reads a 64-bit big endian integer; buf[0] is the most significant byte.
static inline uint64_t
read64be(const uint8_t *buf)
{
	return ((uint64_t)buf[0] << 56)
			| ((uint64_t)buf[1] << 48)
			| ((uint64_t)buf[2] << 40)
			| ((uint64_t)buf[3] << 32)
			| ((uint64_t)buf[4] << 24)
			| ((uint64_t)buf[5] << 16)
			| ((uint64_t)buf[6] << 8)
			| (uint64_t)buf[7];
}


/// Reads a 64-bit little endian integer; buf[0] is the least significant byte.
static inline uint64_t
read64le(const uint8_t *buf)
{
	return (uint64_t)buf[0]
			| ((uint64_t)buf[1] << 8)
			| ((uint64_t)buf[2] << 16)
			| ((uint64_t)buf[3] << 24)
			| ((uint64_t)buf[4] << 32)
			| ((uint64_t)buf[5] << 40)
			| ((uint64_t)buf[6] << 48)
			| ((uint64_t)buf[7] << 56);
}


// The big endian writers below store the bytes from the least significant
// one upwards, i.e. from the end of the field towards buf[0].

/// Writes num as a 16-bit big endian integer.
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[1] = (uint8_t)num;
	buf[0] = (uint8_t)(num >> 8);
}


/// Writes num as a 16-bit little endian integer.
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
}


/// Writes num as a 32-bit big endian integer.
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[3] = (uint8_t)num;
	buf[2] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)(num >> 16);
	buf[0] = (uint8_t)(num >> 24);
}


/// Writes num as a 32-bit little endian integer.
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
}


/// Writes num as a 64-bit big endian integer.
static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[7] = (uint8_t)num;
	buf[6] = (uint8_t)(num >> 8);
	buf[5] = (uint8_t)(num >> 16);
	buf[4] = (uint8_t)(num >> 24);
	buf[3] = (uint8_t)(num >> 32);
	buf[2] = (uint8_t)(num >> 40);
	buf[1] = (uint8_t)(num >> 48);
	buf[0] = (uint8_t)(num >> 56);
}


/// Writes num as a 64-bit little endian integer.
static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
//
// NOTE(review): the __lint__ variant yields a "const void *", and the
// aligned_writeXXne() functions pass the result as the memcpy()
// destination. Confirm that lint accepts this.
#if defined(__lint__)
# define tuklib_assume_aligned(ptr, align) (const void *)(ptr)
#elif defined(HAVE___BUILTIN_ASSUME_ALIGNED)
# define tuklib_assume_aligned(ptr, align) \
		__builtin_assume_aligned(ptr, align)
#else
	// No way to pass the alignment to the compiler. Unless unaligned
	// access is fast anyway, make the aligned reads use type punning.
# define tuklib_assume_aligned(ptr, align) (ptr)
# if !defined(TUKLIB_FAST_UNALIGNED_ACCESS)
#  define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
# endif
#endif


/// Reads a 16-bit native endian integer from buf, which the caller
/// guarantees to be suitably aligned.
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint16_t value;
	memcpy(&value, tuklib_assume_aligned(buf, sizeof(value)),
			sizeof(value));
	return value;
#else
	return *(const uint16_t *)buf;
#endif
}


/// Reads a 32-bit native endian integer from buf, which the caller
/// guarantees to be suitably aligned.
static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint32_t value;
	memcpy(&value, tuklib_assume_aligned(buf, sizeof(value)),
			sizeof(value));
	return value;
#else
	return *(const uint32_t *)buf;
#endif
}


/// Reads a 64-bit native endian integer from buf, which the caller
/// guarantees to be suitably aligned.
static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	uint64_t value;
	memcpy(&value, tuklib_assume_aligned(buf, sizeof(value)),
			sizeof(value));
	return value;
#else
	return *(const uint64_t *)buf;
#endif
}


/// Writes num as a 16-bit native endian integer to buf, which the caller
/// guarantees to be suitably aligned.
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	memcpy(tuklib_assume_aligned(buf, sizeof(num)), &num, sizeof(num));
#else
	*(uint16_t *)buf = num;
#endif
}


/// Writes num as a 32-bit native endian integer to buf, which the caller
/// guarantees to be suitably aligned.
static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	memcpy(tuklib_assume_aligned(buf, sizeof(num)), &num, sizeof(num));
#else
	*(uint32_t *)buf = num;
#endif
}
725 726 static inline void 727 aligned_write64ne(uint8_t *buf, uint64_t num) 728 { 729 #ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING 730 *(uint64_t *)buf = num; 731 #else 732 memcpy(tuklib_assume_aligned(buf, sizeof(num)), &num, sizeof(num)); 733 #endif 734 return; 735 } 736 737 738 static inline uint16_t 739 aligned_read16be(const uint8_t *buf) 740 { 741 uint16_t num = aligned_read16ne(buf); 742 return conv16be(num); 743 } 744 745 746 static inline uint16_t 747 aligned_read16le(const uint8_t *buf) 748 { 749 uint16_t num = aligned_read16ne(buf); 750 return conv16le(num); 751 } 752 753 754 static inline uint32_t 755 aligned_read32be(const uint8_t *buf) 756 { 757 uint32_t num = aligned_read32ne(buf); 758 return conv32be(num); 759 } 760 761 762 static inline uint32_t 763 aligned_read32le(const uint8_t *buf) 764 { 765 uint32_t num = aligned_read32ne(buf); 766 return conv32le(num); 767 } 768 769 770 static inline uint64_t 771 aligned_read64be(const uint8_t *buf) 772 { 773 uint64_t num = aligned_read64ne(buf); 774 return conv64be(num); 775 } 776 777 778 static inline uint64_t 779 aligned_read64le(const uint8_t *buf) 780 { 781 uint64_t num = aligned_read64ne(buf); 782 return conv64le(num); 783 } 784 785 786 // These need to be macros like in the unaligned case. 787 #define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num)) 788 #define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num)) 789 #define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num)) 790 #define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num)) 791 #define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num)) 792 #define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num)) 793 794 795 //////////////////// 796 // Bit operations // 797 //////////////////// 798 799 static inline uint32_t 800 bsr32(uint32_t n) 801 { 802 // Check for ICC first, since it tends to define __GNUC__ too. 
803 #if defined(__INTEL_COMPILER) 804 return _bit_scan_reverse(n); 805 806 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX 807 // GCC >= 3.4 has __builtin_clz(), which gives good results on 808 // multiple architectures. On x86, __builtin_clz() ^ 31U becomes 809 // either plain BSR (so the XOR gets optimized away) or LZCNT and 810 // XOR (if -march indicates that SSE4a instructions are supported). 811 return (uint32_t)__builtin_clz(n) ^ 31U; 812 813 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 814 uint32_t i; 815 __asm__("bsrl %1, %0" : "=r" (i) : "rm" (n)); 816 return i; 817 818 #elif defined(_MSC_VER) 819 unsigned long i; 820 _BitScanReverse(&i, n); 821 return i; 822 823 #else 824 uint32_t i = 31; 825 826 if ((n & 0xFFFF0000) == 0) { 827 n <<= 16; 828 i = 15; 829 } 830 831 if ((n & 0xFF000000) == 0) { 832 n <<= 8; 833 i -= 8; 834 } 835 836 if ((n & 0xF0000000) == 0) { 837 n <<= 4; 838 i -= 4; 839 } 840 841 if ((n & 0xC0000000) == 0) { 842 n <<= 2; 843 i -= 2; 844 } 845 846 if ((n & 0x80000000) == 0) 847 --i; 848 849 return i; 850 #endif 851 } 852 853 854 static inline uint32_t 855 clz32(uint32_t n) 856 { 857 #if defined(__INTEL_COMPILER) 858 return _bit_scan_reverse(n) ^ 31U; 859 860 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX 861 return (uint32_t)__builtin_clz(n); 862 863 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 864 uint32_t i; 865 __asm__("bsrl %1, %0\n\t" 866 "xorl $31, %0" 867 : "=r" (i) : "rm" (n)); 868 return i; 869 870 #elif defined(_MSC_VER) 871 unsigned long i; 872 _BitScanReverse(&i, n); 873 return i ^ 31U; 874 875 #else 876 uint32_t i = 0; 877 878 if ((n & 0xFFFF0000) == 0) { 879 n <<= 16; 880 i = 16; 881 } 882 883 if ((n & 0xFF000000) == 0) { 884 n <<= 8; 885 i += 8; 886 } 887 888 if ((n & 0xF0000000) == 0) { 889 n <<= 4; 890 i += 4; 891 } 892 893 if ((n & 0xC0000000) == 0) { 894 n <<= 2; 895 i += 2; 896 } 897 898 if ((n & 0x80000000) 
== 0) 899 ++i; 900 901 return i; 902 #endif 903 } 904 905 906 static inline uint32_t 907 ctz32(uint32_t n) 908 { 909 #if defined(__INTEL_COMPILER) 910 return _bit_scan_forward(n); 911 912 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX 913 return (uint32_t)__builtin_ctz(n); 914 915 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 916 uint32_t i; 917 __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n)); 918 return i; 919 920 #elif defined(_MSC_VER) 921 unsigned long i; 922 _BitScanForward(&i, n); 923 return i; 924 925 #else 926 uint32_t i = 0; 927 928 if ((n & 0x0000FFFF) == 0) { 929 n >>= 16; 930 i = 16; 931 } 932 933 if ((n & 0x000000FF) == 0) { 934 n >>= 8; 935 i += 8; 936 } 937 938 if ((n & 0x0000000F) == 0) { 939 n >>= 4; 940 i += 4; 941 } 942 943 if ((n & 0x00000003) == 0) { 944 n >>= 2; 945 i += 2; 946 } 947 948 if ((n & 0x00000001) == 0) 949 ++i; 950 951 return i; 952 #endif 953 } 954 955 #define bsf32 ctz32 956 957 #endif 958