Home | History | Annotate | Line # | Download | only in common
      1 ///////////////////////////////////////////////////////////////////////////////
      2 //
      3 /// \file       memcmplen.h
      4 /// \brief      Optimized comparison of two buffers
      5 //
      6 //  Author:     Lasse Collin
      7 //
      8 //  This file has been put into the public domain.
      9 //  You can do whatever you want with this file.
     10 //
     11 ///////////////////////////////////////////////////////////////////////////////
     12 
     13 #ifndef LZMA_MEMCMPLEN_H
     14 #define LZMA_MEMCMPLEN_H
     15 
     16 #include "common.h"
     17 
     18 #ifdef HAVE_IMMINTRIN_H
     19 #	include <immintrin.h>
     20 #endif
     21 
     22 
     23 /// Find out how many equal bytes the two buffers have.
     24 ///
     25 /// \param      buf1    First buffer
     26 /// \param      buf2    Second buffer
     27 /// \param      len     How many bytes have already been compared and will
     28 ///                     be assumed to match
     29 /// \param      limit   How many bytes to compare at most, including the
     30 ///                     already-compared bytes. This must be significantly
     31 ///                     smaller than UINT32_MAX to avoid integer overflows.
     32 ///                     Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past
     33 ///                     the specified limit from both buf1 and buf2.
     34 ///
     35 /// \return     Number of equal bytes in the buffers is returned.
     36 ///             This is always at least len and at most limit.
     37 ///
     38 /// \note       LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read.
     39 ///             It's rounded up to 2^n. This extra amount needs to be
     40 ///             allocated in the buffers being used. It needs to be
     41 ///             initialized too to keep Valgrind quiet.
     42 static inline uint32_t lzma_attribute((__always_inline__))
     43 lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2,
     44 		uint32_t len, uint32_t limit)
     45 {
     46 	assert(len <= limit);
     47 	assert(limit <= UINT32_MAX / 2);
     48 
     49 #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
     50 		&& ((TUKLIB_GNUC_REQ(3, 4) && defined(__x86_64__)) \
     51 			|| (defined(__INTEL_COMPILER) && defined(__x86_64__)) \
     52 			|| (defined(__INTEL_COMPILER) && defined(_M_X64)) \
     53 			|| (defined(_MSC_VER) && defined(_M_X64)))
     54 	// NOTE: This will use 64-bit unaligned access which
     55 	// TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit, but
     56 	// it's convenient here at least as long as it's x86-64 only.
     57 	//
     58 	// I keep this x86-64 only for now since that's where I know this
     59 	// to be a good method. This may be fine on other 64-bit CPUs too.
     60 	// On big endian one should use xor instead of subtraction and switch
     61 	// to __builtin_clzll().
     62 #define LZMA_MEMCMPLEN_EXTRA 8
     63 	while (len < limit) {
     64 		const uint64_t x = *(const uint64_t *)(buf1 + len)
     65 				- *(const uint64_t *)(buf2 + len);
     66 		if (x != 0) {
     67 #	if defined(_M_X64) // MSVC or Intel C compiler on Windows
     68 			unsigned long tmp;
     69 			_BitScanForward64(&tmp, x);
     70 			len += (uint32_t)tmp >> 3;
     71 #	else // GCC, clang, or Intel C compiler
     72 			len += (uint32_t)__builtin_ctzll(x) >> 3;
     73 #	endif
     74 			return my_min(len, limit);
     75 		}
     76 
     77 		len += 8;
     78 	}
     79 
     80 	return limit;
     81 
     82 #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
     83 		&& defined(HAVE__MM_MOVEMASK_EPI8) \
     84 		&& ((defined(__GNUC__) && defined(__SSE2_MATH__)) \
     85 			|| (defined(__INTEL_COMPILER) && defined(__SSE2__)) \
     86 			|| (defined(_MSC_VER) && defined(_M_IX86_FP) \
     87 				&& _M_IX86_FP >= 2))
     88 	// NOTE: Like above, this will use 128-bit unaligned access which
     89 	// TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit.
     90 	//
     91 	// SSE2 version for 32-bit and 64-bit x86. On x86-64 the above
     92 	// version is sometimes significantly faster and sometimes
     93 	// slightly slower than this SSE2 version, so this SSE2
     94 	// version isn't used on x86-64.
     95 #	define LZMA_MEMCMPLEN_EXTRA 16
     96 	while (len < limit) {
     97 		const uint32_t x = 0xFFFF ^ _mm_movemask_epi8(_mm_cmpeq_epi8(
     98 			_mm_loadu_si128((const __m128i *)(buf1 + len)),
     99 			_mm_loadu_si128((const __m128i *)(buf2 + len))));
    100 
    101 		if (x != 0) {
    102 #	if defined(__INTEL_COMPILER)
    103 			len += _bit_scan_forward(x);
    104 #	elif defined(_MSC_VER)
    105 			unsigned long tmp;
    106 			_BitScanForward(&tmp, x);
    107 			len += tmp;
    108 #	else
    109 			len += __builtin_ctz(x);
    110 #	endif
    111 			return my_min(len, limit);
    112 		}
    113 
    114 		len += 16;
    115 	}
    116 
    117 	return limit;
    118 
    119 #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN)
    120 	// Generic 32-bit little endian method
    121 #	define LZMA_MEMCMPLEN_EXTRA 4
    122 	while (len < limit) {
    123 		uint32_t x = *(const uint32_t *)(buf1 + len)
    124 				- *(const uint32_t *)(buf2 + len);
    125 		if (x != 0) {
    126 			if ((x & 0xFFFF) == 0) {
    127 				len += 2;
    128 				x >>= 16;
    129 			}
    130 
    131 			if ((x & 0xFF) == 0)
    132 				++len;
    133 
    134 			return my_min(len, limit);
    135 		}
    136 
    137 		len += 4;
    138 	}
    139 
    140 	return limit;
    141 
    142 #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN)
    143 	// Generic 32-bit big endian method
    144 #	define LZMA_MEMCMPLEN_EXTRA 4
    145 	while (len < limit) {
    146 		uint32_t x = *(const uint32_t *)(buf1 + len)
    147 				^ *(const uint32_t *)(buf2 + len);
    148 		if (x != 0) {
    149 			if ((x & 0xFFFF0000) == 0) {
    150 				len += 2;
    151 				x <<= 16;
    152 			}
    153 
    154 			if ((x & 0xFF000000) == 0)
    155 				++len;
    156 
    157 			return my_min(len, limit);
    158 		}
    159 
    160 		len += 4;
    161 	}
    162 
    163 	return limit;
    164 
    165 #else
    166 	// Simple portable version that doesn't use unaligned access.
    167 #	define LZMA_MEMCMPLEN_EXTRA 0
    168 	while (len < limit && buf1[len] == buf2[len])
    169 		++len;
    170 
    171 	return len;
    172 #endif
    173 }
    174 
    175 #endif
    176