Home | History | Annotate | Line # | Download | only in common
      1 // SPDX-License-Identifier: 0BSD
      2 
      3 ///////////////////////////////////////////////////////////////////////////////
      4 //
      5 /// \file       tuklib_integer.h
      6 /// \brief      Various integer and bit operations
      7 ///
      8 /// This file provides macros or functions to do some basic integer and bit
      9 /// operations.
     10 ///
     11 /// Native endian inline functions (XX = 16, 32, or 64):
     12 ///   - Unaligned native endian reads: readXXne(ptr)
     13 ///   - Unaligned native endian writes: writeXXne(ptr, num)
     14 ///   - Aligned native endian reads: aligned_readXXne(ptr)
     15 ///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
     16 ///
     17 /// Endianness-converting integer operations (these can be macros!)
     18 /// (XX = 16, 32, or 64; Y = b or l):
     19 ///   - Byte swapping: byteswapXX(num)
     20 ///   - Byte order conversions to/from native (byteswaps if Y isn't
     21 ///     the native endianness): convXXYe(num)
     22 ///   - Unaligned reads: readXXYe(ptr)
     23 ///   - Unaligned writes: writeXXYe(ptr, num)
     24 ///   - Aligned reads: aligned_readXXYe(ptr)
     25 ///   - Aligned writes: aligned_writeXXYe(ptr, num)
     26 ///
/// Since the above can be macros, the arguments should have no side effects
     28 /// because they may be evaluated more than once.
     29 ///
     30 /// Bit scan operations for non-zero 32-bit integers (inline functions):
     31 ///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
     32 ///   - Count leading zeros: clz32(num)
     33 ///   - Count trailing zeros: ctz32(num)
     34 ///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
     35 ///
     36 /// The above bit scan operations return 0-31. If num is zero,
     37 /// the result is undefined.
     38 //
     39 //  Authors:    Lasse Collin
     40 //              Joachim Henke
     41 //
     42 ///////////////////////////////////////////////////////////////////////////////
     43 
     44 #ifndef TUKLIB_INTEGER_H
     45 #define TUKLIB_INTEGER_H
     46 
     47 #include "tuklib_common.h"
     48 #include <string.h>
     49 
     50 // Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
     51 // and such functions.
     52 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
     53 #	include <immintrin.h>
     54 // Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
     56 // GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
     57 // cases explicitly.
     58 #elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
     59 #	include <intrin.h>
     60 #endif
     61 
     62 
     63 ///////////////////
     64 // Byte swapping //
     65 ///////////////////
     66 
// Choose a platform-specific provider for byteswapXX() and, where the
// platform offers them, for the convXXYe() macros too.
#ifdef HAVE___BUILTIN_BSWAPXX
	// Compiler builtins: GCC >= 4.8 and Clang
#	define byteswap16(n) __builtin_bswap16(n)
#	define byteswap32(n) __builtin_bswap32(n)
#	define byteswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// <byteswap.h>: glibc, uClibc, dietlibc
#	include <byteswap.h>
#	if defined(HAVE_BSWAP_16)
#		define byteswap16(n) bswap_16(n)
#	endif
#	if defined(HAVE_BSWAP_32)
#		define byteswap32(n) bswap_32(n)
#	endif
#	if defined(HAVE_BSWAP_64)
#		define byteswap64(n) bswap_64(n)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// <sys/endian.h>: *BSDs and Darwin
#	include <sys/endian.h>
#	if !defined(__OpenBSD__)
#		define byteswap16(n) bswap16(n)
#		define byteswap32(n) bswap32(n)
#		define byteswap64(n) bswap64(n)
#	else
		// OpenBSD spells these without the leading "b".
#		define byteswap16(n) swap16(n)
#		define byteswap32(n) swap32(n)
#		define byteswap64(n) swap64(n)
#	endif

#elif defined(HAVE_SYS_BYTEORDER_H)
	// <sys/byteorder.h>: Solaris. Each macro is used only if the
	// header actually provides it.
#	include <sys/byteorder.h>
#	if defined(BSWAP_16)
#		define byteswap16(n) BSWAP_16(n)
#	endif
#	if defined(BSWAP_32)
#		define byteswap32(n) BSWAP_32(n)
#	endif
#	if defined(BSWAP_64)
#		define byteswap64(n) BSWAP_64(n)
#	endif
#	if defined(BE_16)
#		define conv16be(n) BE_16(n)
#	endif
#	if defined(BE_32)
#		define conv32be(n) BE_32(n)
#	endif
#	if defined(BE_64)
#		define conv64be(n) BE_64(n)
#	endif
#	if defined(LE_16)
#		define conv16le(n) LE_16(n)
#	endif
#	if defined(LE_32)
#		define conv32le(n) LE_32(n)
#	endif
#	if defined(LE_64)
#		define conv64le(n) LE_64(n)
#	endif
#endif
    130 
#ifndef byteswap16
	// Generic 16-bit byte swap used when no platform-specific version was
	// found. The argument is evaluated more than once, so it must not have
	// side effects. The whole expansion is parenthesized so that it acts as
	// a single operand next to any operator (for example sizeof).
#	define byteswap16(n) ((uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	))
#endif
    137 
#ifndef byteswap32
	// Generic 32-bit byte swap used when no platform-specific version was
	// found. The argument is evaluated more than once, so it must not have
	// side effects. The whole expansion is parenthesized so that it acts as
	// a single operand next to any operator (for example sizeof).
#	define byteswap32(n) ((uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	))
#endif
    146 
#ifndef byteswap64
	// Generic 64-bit byte swap used when no platform-specific version was
	// found. The argument is evaluated more than once, so it must not have
	// side effects. The whole expansion is parenthesized so that it acts as
	// a single operand next to any operator (for example sizeof).
#	define byteswap64(n) ((uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	))
#endif
    159 
    160 // Define conversion macros using the basic byte swapping macros.
// Each convXXYe() is defined only if the platform headers didn't already
// provide it. A conversion to/from the native byte order is a plain cast;
// a conversion to/from the other byte order is a byte swap.
#ifndef conv16be
#	ifdef WORDS_BIGENDIAN
#		define conv16be(num) ((uint16_t)(num))
#	else
#		define conv16be(num) byteswap16(num)
#	endif
#endif

#ifndef conv32be
#	ifdef WORDS_BIGENDIAN
#		define conv32be(num) ((uint32_t)(num))
#	else
#		define conv32be(num) byteswap32(num)
#	endif
#endif

#ifndef conv64be
#	ifdef WORDS_BIGENDIAN
#		define conv64be(num) ((uint64_t)(num))
#	else
#		define conv64be(num) byteswap64(num)
#	endif
#endif

#ifndef conv16le
#	ifdef WORDS_BIGENDIAN
#		define conv16le(num) byteswap16(num)
#	else
#		define conv16le(num) ((uint16_t)(num))
#	endif
#endif

#ifndef conv32le
#	ifdef WORDS_BIGENDIAN
#		define conv32le(num) byteswap32(num)
#	else
#		define conv32le(num) ((uint32_t)(num))
#	endif
#endif

#ifndef conv64le
#	ifdef WORDS_BIGENDIAN
#		define conv64le(num) byteswap64(num)
#	else
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
    200 
    201 
    202 ////////////////////////////////
    203 // Unaligned reads and writes //
    204 ////////////////////////////////
    205 
    206 // No-strict-align archs like x86-64
    207 // ---------------------------------
    208 //
    209 // The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because casts of
// this kind break strict aliasing rules and result in undefined behavior.
    212 // With unaligned pointers it's even worse: compilers may emit vector
    213 // instructions that require aligned pointers even if non-vector
    214 // instructions work with unaligned pointers.
    215 //
    216 // Using memcpy() is the standard compliant way to do unaligned access.
    217 // Many modern compilers inline it so there is no function call overhead.
    218 // For those compilers that don't handle the memcpy() method well, the
    219 // old casting method (that violates strict aliasing) can be requested at
    220 // build time. A third method, casting to a packed struct, would also be
    221 // an option but isn't provided to keep things simpler (it's already a mess).
    222 // Hopefully this is flexible enough in practice.
    223 //
    224 // Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
    225 //
    226 //     buf[0] | (buf[1] << 8)
    227 //
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
    230 // produce either the same or better code with memcpy(). For example, Clang 9
    231 // on x86-64 can detect 32-bit load but not 16-bit load.
    232 //
    233 // MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
    234 // code for "buf[0] | (buf[1] << 8)".
    235 //
    236 // Conclusion: The memcpy() method is the best choice when unaligned access
    237 // is supported.
    238 //
    239 // Strict-align archs like SPARC
    240 // -----------------------------
    241 //
// GCC versions from around 4.x to at least 13.2.0 produce worse code
    243 // from the memcpy() method than from simple byte-by-byte shift-or code
    244 // when reading a 32-bit integer:
    245 //
    246 //     (1) It may be constructed on stack using four 8-bit loads,
    247 //         four 8-bit stores to stack, and finally one 32-bit load from stack.
    248 //
    249 //     (2) Especially with -Os, an actual memcpy() call may be emitted.
    250 //
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
    256 //
    257 // For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
    259 // was one instruction longer.
    260 //
    261 // Conclusion: At least in case of GCC and Clang, byte-by-byte code is
    262 // the best choice for strict-align archs to do unaligned access.
    263 //
    264 // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
    265 //
    266 // Thanks to <https://godbolt.org/> it was easy to test different compilers.
    267 // The following is for little endian targets:
    268 /*
    269 #include <stdint.h>
    270 #include <string.h>
    271 
    272 uint32_t bytes16(const uint8_t *b)
    273 {
    274     return (uint32_t)b[0]
    275         | ((uint32_t)b[1] << 8);
    276 }
    277 
    278 uint32_t copy16(const uint8_t *b)
    279 {
    280     uint16_t v;
    281     memcpy(&v, b, sizeof(v));
    282     return v;
    283 }
    284 
    285 uint32_t bytes32(const uint8_t *b)
    286 {
    287     return (uint32_t)b[0]
    288         | ((uint32_t)b[1] << 8)
    289         | ((uint32_t)b[2] << 16)
    290         | ((uint32_t)b[3] << 24);
    291 }
    292 
    293 uint32_t copy32(const uint8_t *b)
    294 {
    295     uint32_t v;
    296     memcpy(&v, b, sizeof(v));
    297     return v;
    298 }
    299 
    300 void wbytes16(uint8_t *b, uint16_t v)
    301 {
    302     b[0] = (uint8_t)v;
    303     b[1] = (uint8_t)(v >> 8);
    304 }
    305 
    306 void wcopy16(uint8_t *b, uint16_t v)
    307 {
    308     memcpy(b, &v, sizeof(v));
    309 }
    310 
    311 void wbytes32(uint8_t *b, uint32_t v)
    312 {
    313     b[0] = (uint8_t)v;
    314     b[1] = (uint8_t)(v >> 8);
    315     b[2] = (uint8_t)(v >> 16);
    316     b[3] = (uint8_t)(v >> 24);
    317 }
    318 
    319 void wcopy32(uint8_t *b, uint32_t v)
    320 {
    321     memcpy(b, &v, sizeof(v));
    322 }
    323 */
    324 
    325 
    326 #ifdef TUKLIB_FAST_UNALIGNED_ACCESS
    327 
// Read a 16-bit integer in native byte order from a buffer that does not
// need to be aligned.
static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do unaligned access.
	uint16_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: a cast that violates strict aliasing.
	return *(const uint16_t *)buf;
#endif
}
    339 
    340 
// Read a 32-bit integer in native byte order from a buffer that does not
// need to be aligned.
static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do unaligned access.
	uint32_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: a cast that violates strict aliasing.
	return *(const uint32_t *)buf;
#endif
}
    352 
    353 
// Read a 64-bit integer in native byte order from a buffer that does not
// need to be aligned.
static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do unaligned access.
	uint64_t value;
	memcpy(&value, buf, sizeof(value));
	return value;
#else
	// Requested at build time: a cast that violates strict aliasing.
	return *(const uint64_t *)buf;
#endif
}
    365 
    366 
// Write a 16-bit integer in native byte order to a buffer that does not
// need to be aligned.
static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do unaligned access.
	memcpy(buf, &num, sizeof(uint16_t));
#else
	// Requested at build time: a cast that violates strict aliasing.
	*(uint16_t *)buf = num;
#endif
}
    377 
    378 
// Write a 32-bit integer in native byte order to a buffer that does not
// need to be aligned.
static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do unaligned access.
	memcpy(buf, &num, sizeof(uint32_t));
#else
	// Requested at build time: a cast that violates strict aliasing.
	*(uint32_t *)buf = num;
#endif
}
    389 
    390 
// Write a 64-bit integer in native byte order to a buffer that does not
// need to be aligned.
static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() is the standard compliant way to do unaligned access.
	memcpy(buf, &num, sizeof(uint64_t));
#else
	// Requested at build time: a cast that violates strict aliasing.
	*(uint64_t *)buf = num;
#endif
}
    401 
    402 
// Read an unaligned 16-bit big endian integer.
static inline uint16_t
read16be(const uint8_t *buf)
{
	// conv16be() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint16_t native = read16ne(buf);
	return conv16be(native);
}
    409 
    410 
// Read an unaligned 16-bit little endian integer.
static inline uint16_t
read16le(const uint8_t *buf)
{
	// conv16le() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint16_t native = read16ne(buf);
	return conv16le(native);
}
    417 
    418 
// Read an unaligned 32-bit big endian integer.
static inline uint32_t
read32be(const uint8_t *buf)
{
	// conv32be() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint32_t native = read32ne(buf);
	return conv32be(native);
}
    425 
    426 
// Read an unaligned 32-bit little endian integer.
static inline uint32_t
read32le(const uint8_t *buf)
{
	// conv32le() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint32_t native = read32ne(buf);
	return conv32le(native);
}
    433 
    434 
// Read an unaligned 64-bit big endian integer.
static inline uint64_t
read64be(const uint8_t *buf)
{
	// conv64be() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint64_t native = read64ne(buf);
	return conv64be(native);
}
    441 
    442 
// Read an unaligned 64-bit little endian integer.
static inline uint64_t
read64le(const uint8_t *buf)
{
	// conv64le() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint64_t native = read64ne(buf);
	return conv64le(native);
}
    449 
    450 
    451 // NOTE: Possible byte swapping must be done in a macro to allow the compiler
    452 // to optimize byte swapping of constants when using glibc's or *BSD's
    453 // byte swapping macros. The actual write is done in an inline function
    454 // to make type checking of the buf pointer possible.
// Unaligned endianness-converting writes, grouped by integer width. The
// conversion happens in the macro; the store itself is done by the
// writeXXne() inline function.
#define write16be(buf, num) write16ne((buf), conv16be(num))
#define write16le(buf, num) write16ne((buf), conv16le(num))
#define write32be(buf, num) write32ne((buf), conv32be(num))
#define write32le(buf, num) write32ne((buf), conv32le(num))
#define write64be(buf, num) write64ne((buf), conv64be(num))
#define write64le(buf, num) write64ne((buf), conv64le(num))
    461 
    462 #else
    463 
// Here the native endian names are plain aliases for the functions of
// the matching fixed byte order.
#ifndef WORDS_BIGENDIAN
	// Little endian host
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#else
	// Big endian host
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#endif
    479 
    480 
// Read a 16-bit big endian integer one byte at a time.
static inline uint16_t
read16be(const uint8_t *buf)
{
	const uint16_t hi = buf[0];
	const uint16_t lo = buf[1];
	return (uint16_t)((hi << 8) | lo);
}
    487 
    488 
// Read a 16-bit little endian integer one byte at a time.
static inline uint16_t
read16le(const uint8_t *buf)
{
	const uint16_t lo = buf[0];
	const uint16_t hi = buf[1];
	return (uint16_t)(lo | (hi << 8));
}
    495 
    496 
// Read a 32-bit big endian integer one byte at a time.
static inline uint32_t
read32be(const uint8_t *buf)
{
	return ((uint32_t)buf[0] << 24)
			| ((uint32_t)buf[1] << 16)
			| ((uint32_t)buf[2] << 8)
			| (uint32_t)buf[3];
}
    506 
    507 
// Read a 32-bit little endian integer one byte at a time.
static inline uint32_t
read32le(const uint8_t *buf)
{
	return (uint32_t)buf[0]
			| ((uint32_t)buf[1] << 8)
			| ((uint32_t)buf[2] << 16)
			| ((uint32_t)buf[3] << 24);
}
    517 
    518 
// Read a 64-bit big endian integer one byte at a time.
static inline uint64_t
read64be(const uint8_t *buf)
{
	return ((uint64_t)buf[0] << 56)
			| ((uint64_t)buf[1] << 48)
			| ((uint64_t)buf[2] << 40)
			| ((uint64_t)buf[3] << 32)
			| ((uint64_t)buf[4] << 24)
			| ((uint64_t)buf[5] << 16)
			| ((uint64_t)buf[6] << 8)
			| (uint64_t)buf[7];
}
    532 
    533 
// Read a 64-bit little endian integer one byte at a time.
static inline uint64_t
read64le(const uint8_t *buf)
{
	return (uint64_t)buf[0]
			| ((uint64_t)buf[1] << 8)
			| ((uint64_t)buf[2] << 16)
			| ((uint64_t)buf[3] << 24)
			| ((uint64_t)buf[4] << 32)
			| ((uint64_t)buf[5] << 40)
			| ((uint64_t)buf[6] << 48)
			| ((uint64_t)buf[7] << 56);
}
    547 
    548 
// Store a 16-bit integer in big endian order one byte at a time.
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)((num >> 8) & 0xFF);
	buf[1] = (uint8_t)(num & 0xFF);
}
    556 
    557 
// Store a 16-bit integer in little endian order one byte at a time.
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
}
    565 
    566 
// Store a 32-bit integer in big endian order one byte at a time.
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)((num >> 24) & 0xFF);
	buf[1] = (uint8_t)((num >> 16) & 0xFF);
	buf[2] = (uint8_t)((num >> 8) & 0xFF);
	buf[3] = (uint8_t)(num & 0xFF);
}
    576 
    577 
// Store a 32-bit integer in little endian order one byte at a time.
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
	buf[2] = (uint8_t)((num >> 16) & 0xFF);
	buf[3] = (uint8_t)((num >> 24) & 0xFF);
}
    587 
    588 
// Store a 64-bit integer in big endian order one byte at a time.
static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)((num >> 56) & 0xFF);
	buf[1] = (uint8_t)((num >> 48) & 0xFF);
	buf[2] = (uint8_t)((num >> 40) & 0xFF);
	buf[3] = (uint8_t)((num >> 32) & 0xFF);
	buf[4] = (uint8_t)((num >> 24) & 0xFF);
	buf[5] = (uint8_t)((num >> 16) & 0xFF);
	buf[6] = (uint8_t)((num >> 8) & 0xFF);
	buf[7] = (uint8_t)(num & 0xFF);
}
    602 
    603 
// Store a 64-bit integer in little endian order one byte at a time.
static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num & 0xFF);
	buf[1] = (uint8_t)((num >> 8) & 0xFF);
	buf[2] = (uint8_t)((num >> 16) & 0xFF);
	buf[3] = (uint8_t)((num >> 24) & 0xFF);
	buf[4] = (uint8_t)((num >> 32) & 0xFF);
	buf[5] = (uint8_t)((num >> 40) & 0xFF);
	buf[6] = (uint8_t)((num >> 48) & 0xFF);
	buf[7] = (uint8_t)((num >> 56) & 0xFF);
}
    617 
    618 #endif
    619 
    620 
    621 //////////////////////////////
    622 // Aligned reads and writes //
    623 //////////////////////////////
    624 
    625 // Separate functions for aligned reads and writes are provided since on
    626 // strict-align archs aligned access is much faster than unaligned access.
    627 //
    628 // Just like in the unaligned case, memcpy() is needed to avoid
    629 // strict aliasing violations. However, on archs that don't support
    630 // unaligned access the compiler cannot know that the pointers given
    631 // to memcpy() are aligned which results in slow code. As of C11 there is
    632 // no standard way to tell the compiler that we know that the address is
    633 // aligned but some compilers have language extensions to do that. With
    634 // such language extensions the memcpy() method gives excellent results.
    635 //
    636 // What to do on a strict-align system when no known language extensions
    637 // are available? Falling back to byte-by-byte access would be safe but ruin
    638 // optimizations that have been made specifically with aligned access in mind.
    639 // As a compromise, aligned reads will fall back to non-compliant type punning
    640 // but aligned writes will be byte-by-byte, that is, fast reads are preferred
    641 // over fast writes. This obviously isn't great but hopefully it's a working
    642 // compromise for now.
    643 //
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
// tuklib_assume_aligned(ptr, align) evaluates to ptr, using the compiler's
// alignment hint when one is available.
#if defined(__lint__)
#	define tuklib_assume_aligned(ptr, align) (const void *)(ptr)
#elif defined(HAVE___BUILTIN_ASSUME_ALIGNED)
#	define tuklib_assume_aligned(ptr, align) \
		__builtin_assume_aligned(ptr, align)
#else
	// No alignment hint is available. Unless unaligned access is fast,
	// make the aligned reads use a direct cast instead of memcpy().
#	define tuklib_assume_aligned(ptr, align) (ptr)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
    658 
    659 
// Read a 16-bit native endian integer from a buffer that is aligned for
// uint16_t.
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	// memcpy() from a pointer that is marked as aligned.
	uint16_t value;
	memcpy(&value, tuklib_assume_aligned(buf, sizeof(value)),
			sizeof(value));
	return value;
#else
	// Non-compliant type punning.
	return *(const uint16_t *)buf;
#endif
}
    672 
    673 
// Read a 32-bit native endian integer from a buffer that is aligned for
// uint32_t.
static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	// memcpy() from a pointer that is marked as aligned.
	uint32_t value;
	memcpy(&value, tuklib_assume_aligned(buf, sizeof(value)),
			sizeof(value));
	return value;
#else
	// Non-compliant type punning.
	return *(const uint32_t *)buf;
#endif
}
    686 
    687 
// Read a 64-bit native endian integer from a buffer that is aligned for
// uint64_t.
static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if !defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		&& !defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	// memcpy() from a pointer that is marked as aligned.
	uint64_t value;
	memcpy(&value, tuklib_assume_aligned(buf, sizeof(value)),
			sizeof(value));
	return value;
#else
	// Non-compliant type punning.
	return *(const uint64_t *)buf;
#endif
}
    700 
    701 
// Write a 16-bit native endian integer to a buffer that is aligned for
// uint16_t.
static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() to a pointer that is marked as aligned.
	memcpy(tuklib_assume_aligned(buf, sizeof(uint16_t)), &num,
			sizeof(uint16_t));
#else
	// Non-compliant type punning.
	*(uint16_t *)buf = num;
#endif
}
    712 
    713 
// Write a 32-bit native endian integer to a buffer that is aligned for
// uint32_t.
static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() to a pointer that is marked as aligned.
	memcpy(tuklib_assume_aligned(buf, sizeof(uint32_t)), &num,
			sizeof(uint32_t));
#else
	// Non-compliant type punning.
	*(uint32_t *)buf = num;
#endif
}
    724 
    725 
// Write a 64-bit native endian integer to a buffer that is aligned for
// uint64_t.
static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifndef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	// memcpy() to a pointer that is marked as aligned.
	memcpy(tuklib_assume_aligned(buf, sizeof(uint64_t)), &num,
			sizeof(uint64_t));
#else
	// Non-compliant type punning.
	*(uint64_t *)buf = num;
#endif
}
    736 
    737 
// Read an aligned 16-bit big endian integer.
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	// conv16be() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint16_t native = aligned_read16ne(buf);
	return conv16be(native);
}
    744 
    745 
// Read an aligned 16-bit little endian integer.
static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	// conv16le() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint16_t native = aligned_read16ne(buf);
	return conv16le(native);
}
    752 
    753 
// Read an aligned 32-bit big endian integer.
static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	// conv32be() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint32_t native = aligned_read32ne(buf);
	return conv32be(native);
}
    760 
    761 
// Read an aligned 32-bit little endian integer.
static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	// conv32le() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint32_t native = aligned_read32ne(buf);
	return conv32le(native);
}
    768 
    769 
// Read an aligned 64-bit big endian integer.
static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	// conv64be() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint64_t native = aligned_read64ne(buf);
	return conv64be(native);
}
    776 
    777 
// Read an aligned 64-bit little endian integer.
static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	// conv64le() can be a macro that evaluates its argument more than
	// once, so it is given a plain variable instead of the function call.
	uint64_t native = aligned_read64ne(buf);
	return conv64le(native);
}
    784 
    785 
    786 // These need to be macros like in the unaligned case.
// Aligned endianness-converting writes: big endian variants first, then
// little endian. The conversion happens in the macro; the store itself
// is done by the aligned_writeXXne() inline function.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
    793 
    794 
    795 ////////////////////
    796 // Bit operations //
    797 ////////////////////
    798 
    799 static inline uint32_t
    800 bsr32(uint32_t n)
    801 {
    802 	// Check for ICC first, since it tends to define __GNUC__ too.
    803 #if defined(__INTEL_COMPILER)
    804 	return _bit_scan_reverse(n);
    805 
    806 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
    807 	// GCC >= 3.4 has __builtin_clz(), which gives good results on
    808 	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
    809 	// either plain BSR (so the XOR gets optimized away) or LZCNT and
    810 	// XOR (if -march indicates that SSE4a instructions are supported).
    811 	return (uint32_t)__builtin_clz(n) ^ 31U;
    812 
    813 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    814 	uint32_t i;
    815 	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
    816 	return i;
    817 
    818 #elif defined(_MSC_VER)
    819 	unsigned long i;
    820 	_BitScanReverse(&i, n);
    821 	return i;
    822 
    823 #else
    824 	uint32_t i = 31;
    825 
    826 	if ((n & 0xFFFF0000) == 0) {
    827 		n <<= 16;
    828 		i = 15;
    829 	}
    830 
    831 	if ((n & 0xFF000000) == 0) {
    832 		n <<= 8;
    833 		i -= 8;
    834 	}
    835 
    836 	if ((n & 0xF0000000) == 0) {
    837 		n <<= 4;
    838 		i -= 4;
    839 	}
    840 
    841 	if ((n & 0xC0000000) == 0) {
    842 		n <<= 2;
    843 		i -= 2;
    844 	}
    845 
    846 	if ((n & 0x80000000) == 0)
    847 		--i;
    848 
    849 	return i;
    850 #endif
    851 }
    852 
    853 
    854 static inline uint32_t
    855 clz32(uint32_t n)
    856 {
    857 #if defined(__INTEL_COMPILER)
    858 	return _bit_scan_reverse(n) ^ 31U;
    859 
    860 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
    861 	return (uint32_t)__builtin_clz(n);
    862 
    863 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    864 	uint32_t i;
    865 	__asm__("bsrl %1, %0\n\t"
    866 		"xorl $31, %0"
    867 		: "=r" (i) : "rm" (n));
    868 	return i;
    869 
    870 #elif defined(_MSC_VER)
    871 	unsigned long i;
    872 	_BitScanReverse(&i, n);
    873 	return i ^ 31U;
    874 
    875 #else
    876 	uint32_t i = 0;
    877 
    878 	if ((n & 0xFFFF0000) == 0) {
    879 		n <<= 16;
    880 		i = 16;
    881 	}
    882 
    883 	if ((n & 0xFF000000) == 0) {
    884 		n <<= 8;
    885 		i += 8;
    886 	}
    887 
    888 	if ((n & 0xF0000000) == 0) {
    889 		n <<= 4;
    890 		i += 4;
    891 	}
    892 
    893 	if ((n & 0xC0000000) == 0) {
    894 		n <<= 2;
    895 		i += 2;
    896 	}
    897 
    898 	if ((n & 0x80000000) == 0)
    899 		++i;
    900 
    901 	return i;
    902 #endif
    903 }
    904 
    905 
    906 static inline uint32_t
    907 ctz32(uint32_t n)
    908 {
    909 #if defined(__INTEL_COMPILER)
    910 	return _bit_scan_forward(n);
    911 
    912 #elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
    913 	return (uint32_t)__builtin_ctz(n);
    914 
    915 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    916 	uint32_t i;
    917 	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
    918 	return i;
    919 
    920 #elif defined(_MSC_VER)
    921 	unsigned long i;
    922 	_BitScanForward(&i, n);
    923 	return i;
    924 
    925 #else
    926 	uint32_t i = 0;
    927 
    928 	if ((n & 0x0000FFFF) == 0) {
    929 		n >>= 16;
    930 		i = 16;
    931 	}
    932 
    933 	if ((n & 0x000000FF) == 0) {
    934 		n >>= 8;
    935 		i += 8;
    936 	}
    937 
    938 	if ((n & 0x0000000F) == 0) {
    939 		n >>= 4;
    940 		i += 4;
    941 	}
    942 
    943 	if ((n & 0x00000003) == 0) {
    944 		n >>= 2;
    945 		i += 2;
    946 	}
    947 
    948 	if ((n & 0x00000001) == 0)
    949 		++i;
    950 
    951 	return i;
    952 #endif
    953 }
    954 
    955 #define bsf32 ctz32
    956 
    957 #endif
    958