Home | History | Annotate | Line # | Download | only in enc
      1 /* Copyright 2010 Google Inc. All Rights Reserved.
      2 
      3    Distributed under MIT license.
      4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* A (forgetful) hash table to the data seen by the compressor, to
      8    help create backward references to previous data. */
      9 
     10 #ifndef BROTLI_ENC_HASH_H_
     11 #define BROTLI_ENC_HASH_H_
     12 
     13 #include <string.h>  /* memcmp, memset */
     14 
     15 #include "../common/constants.h"
     16 #include "../common/dictionary.h"
     17 #include "../common/platform.h"
     18 #include <brotli/types.h>
     19 #include "./encoder_dict.h"
     20 #include "./fast_log.h"
     21 #include "./find_match_length.h"
     22 #include "./memory.h"
     23 #include "./quality.h"
     24 #include "./static_dict.h"
     25 
     26 #if defined(__cplusplus) || defined(c_plusplus)
     27 extern "C" {
     28 #endif
     29 
     30 typedef struct {
     31   /* Dynamically allocated area; first member for quickest access. */
     32   void* extra;
     33 
     34   size_t dict_num_lookups;
     35   size_t dict_num_matches;
     36 
     37   BrotliHasherParams params;
     38 
     39   /* False if hasher needs to be "prepared" before use. */
     40   BROTLI_BOOL is_prepared_;
     41 } HasherCommon;
     42 
     43 #define score_t size_t
     44 
     45 static const uint32_t kCutoffTransformsCount = 10;
     46 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
     47 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
     48 static const uint64_t kCutoffTransforms =
     49     BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
     50 
     51 typedef struct HasherSearchResult {
     52   size_t len;
     53   size_t distance;
     54   score_t score;
     55   int len_code_delta; /* == len_code - len */
     56 } HasherSearchResult;
     57 
     58 /* kHashMul32 multiplier has these properties:
     59    * The multiplier must be odd. Otherwise we may lose the highest bit.
     60    * No long streaks of ones or zeros.
     61    * There is no effort to ensure that it is a prime, the oddity is enough
     62      for this use.
     63    * The number has been tuned heuristically against compression benchmarks. */
     64 static const uint32_t kHashMul32 = 0x1E35A7BD;
     65 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD);
     66 static const uint64_t kHashMul64Long =
     67     BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u);
     68 
     69 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
     70   uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
     71   /* The higher bits contain more mixture from the multiplication,
     72      so we take our results from there. */
     73   return h >> (32 - 14);
     74 }
     75 
     76 static BROTLI_INLINE void PrepareDistanceCache(
     77     int* BROTLI_RESTRICT distance_cache, const int num_distances) {
     78   if (num_distances > 4) {
     79     int last_distance = distance_cache[0];
     80     distance_cache[4] = last_distance - 1;
     81     distance_cache[5] = last_distance + 1;
     82     distance_cache[6] = last_distance - 2;
     83     distance_cache[7] = last_distance + 2;
     84     distance_cache[8] = last_distance - 3;
     85     distance_cache[9] = last_distance + 3;
     86     if (num_distances > 10) {
     87       int next_last_distance = distance_cache[1];
     88       distance_cache[10] = next_last_distance - 1;
     89       distance_cache[11] = next_last_distance + 1;
     90       distance_cache[12] = next_last_distance - 2;
     91       distance_cache[13] = next_last_distance + 2;
     92       distance_cache[14] = next_last_distance - 3;
     93       distance_cache[15] = next_last_distance + 3;
     94     }
     95   }
     96 }
     97 
     98 #define BROTLI_LITERAL_BYTE_SCORE 135
     99 #define BROTLI_DISTANCE_BIT_PENALTY 30
    100 /* Score must be positive after applying maximal penalty. */
    101 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
    102 
    103 /* Usually, we always choose the longest backward reference. This function
    104    allows for the exception of that rule.
    105 
    106    If we choose a backward reference that is further away, it will
    107    usually be coded with more bits. We approximate this by assuming
    108    log2(distance). If the distance can be expressed in terms of the
    109    last four distances, we use some heuristic constants to estimate
    110    the bits cost. For the first up to four literals we use the bit
    111    cost of the literals from the literal cost model, after that we
    112    use the average bit cost of the cost model.
    113 
    114    This function is used to sometimes discard a longer backward reference
    115    when it is not much longer and the bit cost for encoding it is more
    116    than the saved literals.
    117 
    118    backward_reference_offset MUST be positive. */
    119 static BROTLI_INLINE score_t BackwardReferenceScore(
    120     size_t copy_length, size_t backward_reference_offset) {
    121   return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
    122       BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
    123 }
    124 
    125 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
    126     size_t copy_length) {
    127   return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
    128       BROTLI_SCORE_BASE + 15;
    129 }
    130 
    131 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
    132     size_t distance_short_code) {
    133   return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
    134 }
    135 
    136 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
    137     const BrotliEncoderDictionary* dictionary, size_t len, size_t word_idx,
    138     const uint8_t* data, size_t max_length, size_t max_backward,
    139     size_t max_distance, HasherSearchResult* out) {
    140   size_t offset;
    141   size_t matchlen;
    142   size_t backward;
    143   score_t score;
    144   offset = dictionary->words->offsets_by_length[len] + len * word_idx;
    145   if (len > max_length) {
    146     return BROTLI_FALSE;
    147   }
    148 
    149   matchlen =
    150       FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len);
    151   if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) {
    152     return BROTLI_FALSE;
    153   }
    154   {
    155     size_t cut = len - matchlen;
    156     size_t transform_id = (cut << 2) +
    157         (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
    158     backward = max_backward + 1 + word_idx +
    159         (transform_id << dictionary->words->size_bits_by_length[len]);
    160   }
    161   if (backward > max_distance) {
    162     return BROTLI_FALSE;
    163   }
    164   score = BackwardReferenceScore(matchlen, backward);
    165   if (score < out->score) {
    166     return BROTLI_FALSE;
    167   }
    168   out->len = matchlen;
    169   out->len_code_delta = (int)len - (int)matchlen;
    170   out->distance = backward;
    171   out->score = score;
    172   return BROTLI_TRUE;
    173 }
    174 
    175 static BROTLI_INLINE void SearchInStaticDictionary(
    176     const BrotliEncoderDictionary* dictionary,
    177     HasherCommon* common, const uint8_t* data, size_t max_length,
    178     size_t max_backward, size_t max_distance,
    179     HasherSearchResult* out, BROTLI_BOOL shallow) {
    180   size_t key;
    181   size_t i;
    182   if (common->dict_num_matches < (common->dict_num_lookups >> 7)) {
    183     return;
    184   }
    185   key = Hash14(data) << 1;
    186   for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
    187     common->dict_num_lookups++;
    188     if (dictionary->hash_table_lengths[key] != 0) {
    189       BROTLI_BOOL item_matches = TestStaticDictionaryItem(
    190           dictionary, dictionary->hash_table_lengths[key],
    191           dictionary->hash_table_words[key], data,
    192           max_length, max_backward, max_distance, out);
    193       if (item_matches) {
    194         common->dict_num_matches++;
    195       }
    196     }
    197   }
    198 }
    199 
    200 typedef struct BackwardMatch {
    201   uint32_t distance;
    202   uint32_t length_and_code;
    203 } BackwardMatch;
    204 
    205 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
    206     size_t dist, size_t len) {
    207   self->distance = (uint32_t)dist;
    208   self->length_and_code = (uint32_t)(len << 5);
    209 }
    210 
    211 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
    212     size_t dist, size_t len, size_t len_code) {
    213   self->distance = (uint32_t)dist;
    214   self->length_and_code =
    215       (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
    216 }
    217 
    218 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
    219   return self->length_and_code >> 5;
    220 }
    221 
    222 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
    223   size_t code = self->length_and_code & 31;
    224   return code ? code : BackwardMatchLength(self);
    225 }
    226 
    227 #define EXPAND_CAT(a, b) CAT(a, b)
    228 #define CAT(a, b) a ## b
    229 #define FN(X) EXPAND_CAT(X, HASHER())
    230 
    231 #define HASHER() H10
    232 #define BUCKET_BITS 17
    233 #define MAX_TREE_SEARCH_DEPTH 64
    234 #define MAX_TREE_COMP_LENGTH 128
    235 #include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
    236 #undef MAX_TREE_SEARCH_DEPTH
    237 #undef MAX_TREE_COMP_LENGTH
    238 #undef BUCKET_BITS
    239 #undef HASHER
    240 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
    241 #define MAX_NUM_MATCHES_H10 128
    242 
    243 /* For BUCKET_SWEEP_BITS == 0, enabling the dictionary lookup makes compression
    244    a little faster (0.5% - 1%) and it compresses 0.15% better on small text
    245    and HTML inputs. */
    246 
    247 #define HASHER() H2
    248 #define BUCKET_BITS 16
    249 #define BUCKET_SWEEP_BITS 0
    250 #define HASH_LEN 5
    251 #define USE_DICTIONARY 1
    252 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    253 #undef BUCKET_SWEEP_BITS
    254 #undef USE_DICTIONARY
    255 #undef HASHER
    256 
    257 #define HASHER() H3
    258 #define BUCKET_SWEEP_BITS 1
    259 #define USE_DICTIONARY 0
    260 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    261 #undef USE_DICTIONARY
    262 #undef BUCKET_SWEEP_BITS
    263 #undef BUCKET_BITS
    264 #undef HASHER
    265 
    266 #define HASHER() H4
    267 #define BUCKET_BITS 17
    268 #define BUCKET_SWEEP_BITS 2
    269 #define USE_DICTIONARY 1
    270 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    271 #undef USE_DICTIONARY
    272 #undef HASH_LEN
    273 #undef BUCKET_SWEEP_BITS
    274 #undef BUCKET_BITS
    275 #undef HASHER
    276 
    277 #define HASHER() H5
    278 #include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
    279 #undef HASHER
    280 
    281 #define HASHER() H6
    282 #include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
    283 #undef HASHER
    284 
    285 #define BUCKET_BITS 15
    286 
    287 #define NUM_LAST_DISTANCES_TO_CHECK 4
    288 #define NUM_BANKS 1
    289 #define BANK_BITS 16
    290 #define HASHER() H40
    291 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    292 #undef HASHER
    293 #undef NUM_LAST_DISTANCES_TO_CHECK
    294 
    295 #define NUM_LAST_DISTANCES_TO_CHECK 10
    296 #define HASHER() H41
    297 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    298 #undef HASHER
    299 #undef NUM_LAST_DISTANCES_TO_CHECK
    300 #undef NUM_BANKS
    301 #undef BANK_BITS
    302 
    303 #define NUM_LAST_DISTANCES_TO_CHECK 16
    304 #define NUM_BANKS 512
    305 #define BANK_BITS 9
    306 #define HASHER() H42
    307 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
    308 #undef HASHER
    309 #undef NUM_LAST_DISTANCES_TO_CHECK
    310 #undef NUM_BANKS
    311 #undef BANK_BITS
    312 
    313 #undef BUCKET_BITS
    314 
    315 #define HASHER() H54
    316 #define BUCKET_BITS 20
    317 #define BUCKET_SWEEP_BITS 2
    318 #define HASH_LEN 7
    319 #define USE_DICTIONARY 0
    320 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
    321 #undef USE_DICTIONARY
    322 #undef HASH_LEN
    323 #undef BUCKET_SWEEP_BITS
    324 #undef BUCKET_BITS
    325 #undef HASHER
    326 
    327 /* fast large window hashers */
    328 
    329 #define HASHER() HROLLING_FAST
    330 #define CHUNKLEN 32
    331 #define JUMP 4
    332 #define NUMBUCKETS 16777216
    333 #define MASK ((NUMBUCKETS * 64) - 1)
    334 #include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
    335 #undef JUMP
    336 #undef HASHER
    337 
    338 
    339 #define HASHER() HROLLING
    340 #define JUMP 1
    341 #include "./hash_rolling_inc.h"  /* NOLINT(build/include) */
    342 #undef MASK
    343 #undef NUMBUCKETS
    344 #undef JUMP
    345 #undef CHUNKLEN
    346 #undef HASHER
    347 
    348 #define HASHER() H35
    349 #define HASHER_A H3
    350 #define HASHER_B HROLLING_FAST
    351 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
    352 #undef HASHER_A
    353 #undef HASHER_B
    354 #undef HASHER
    355 
    356 #define HASHER() H55
    357 #define HASHER_A H54
    358 #define HASHER_B HROLLING_FAST
    359 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
    360 #undef HASHER_A
    361 #undef HASHER_B
    362 #undef HASHER
    363 
    364 #define HASHER() H65
    365 #define HASHER_A H6
    366 #define HASHER_B HROLLING
    367 #include "./hash_composite_inc.h"  /* NOLINT(build/include) */
    368 #undef HASHER_A
    369 #undef HASHER_B
    370 #undef HASHER
    371 
    372 #undef FN
    373 #undef CAT
    374 #undef EXPAND_CAT
    375 
    376 #define FOR_SIMPLE_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
    377 #define FOR_COMPOSITE_HASHERS(H) H(35) H(55) H(65)
    378 #define FOR_GENERIC_HASHERS(H) FOR_SIMPLE_HASHERS(H) FOR_COMPOSITE_HASHERS(H)
    379 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
    380 
    381 typedef struct {
    382   HasherCommon common;
    383 
    384   union {
    385 #define MEMBER_(N) \
    386     H ## N _H ## N;
    387     FOR_ALL_HASHERS(MEMBER_)
    388 #undef MEMBER_
    389   } privat;
    390 } Hasher;
    391 
    392 /* MUST be invoked before any other method. */
    393 static BROTLI_INLINE void HasherInit(Hasher* hasher) {
    394   hasher->common.extra = NULL;
    395 }
    396 
    397 static BROTLI_INLINE void DestroyHasher(MemoryManager* m, Hasher* hasher) {
    398   if (hasher->common.extra == NULL) return;
    399   BROTLI_FREE(m, hasher->common.extra);
    400 }
    401 
    402 static BROTLI_INLINE void HasherReset(Hasher* hasher) {
    403   hasher->common.is_prepared_ = BROTLI_FALSE;
    404 }
    405 
    406 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
    407     BROTLI_BOOL one_shot, const size_t input_size) {
    408   switch (params->hasher.type) {
    409 #define SIZE_(N)                                                      \
    410     case N:                                                           \
    411       return HashMemAllocInBytesH ## N(params, one_shot, input_size);
    412     FOR_ALL_HASHERS(SIZE_)
    413 #undef SIZE_
    414     default:
    415       break;
    416   }
    417   return 0;  /* Default case. */
    418 }
    419 
    420 static BROTLI_INLINE void HasherSetup(MemoryManager* m, Hasher* hasher,
    421     BrotliEncoderParams* params, const uint8_t* data, size_t position,
    422     size_t input_size, BROTLI_BOOL is_last) {
    423   BROTLI_BOOL one_shot = (position == 0 && is_last);
    424   if (hasher->common.extra == NULL) {
    425     size_t alloc_size;
    426     ChooseHasher(params, &params->hasher);
    427     alloc_size = HasherSize(params, one_shot, input_size);
    428     hasher->common.extra = BROTLI_ALLOC(m, uint8_t, alloc_size);
    429     if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(hasher->common.extra)) return;
    430     hasher->common.params = params->hasher;
    431     switch (hasher->common.params.type) {
    432 #define INITIALIZE_(N)                        \
    433       case N:                                 \
    434         InitializeH ## N(&hasher->common,     \
    435             &hasher->privat._H ## N, params); \
    436         break;
    437       FOR_ALL_HASHERS(INITIALIZE_);
    438 #undef INITIALIZE_
    439       default:
    440         break;
    441     }
    442     HasherReset(hasher);
    443   }
    444 
    445   if (!hasher->common.is_prepared_) {
    446     switch (hasher->common.params.type) {
    447 #define PREPARE_(N)                      \
    448       case N:                            \
    449         PrepareH ## N(                   \
    450             &hasher->privat._H ## N,     \
    451             one_shot, input_size, data); \
    452         break;
    453       FOR_ALL_HASHERS(PREPARE_)
    454 #undef PREPARE_
    455       default: break;
    456     }
    457     if (position == 0) {
    458       hasher->common.dict_num_lookups = 0;
    459       hasher->common.dict_num_matches = 0;
    460     }
    461     hasher->common.is_prepared_ = BROTLI_TRUE;
    462   }
    463 }
    464 
    465 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
    466     MemoryManager* m, Hasher* hasher, const uint8_t* data, size_t mask,
    467     BrotliEncoderParams* params, size_t position, size_t input_size,
    468     BROTLI_BOOL is_last) {
    469   HasherSetup(m, hasher, params, data, position, input_size, is_last);
    470   if (BROTLI_IS_OOM(m)) return;
    471   switch (hasher->common.params.type) {
    472 #define INIT_(N)                             \
    473     case N:                                  \
    474       StitchToPreviousBlockH ## N(           \
    475           &hasher->privat._H ## N,           \
    476           input_size, position, data, mask); \
    477     break;
    478     FOR_ALL_HASHERS(INIT_)
    479 #undef INIT_
    480     default: break;
    481   }
    482 }
    483 
    484 #if defined(__cplusplus) || defined(c_plusplus)
    485 }  /* extern "C" */
    486 #endif
    487 
    488 #endif  /* BROTLI_ENC_HASH_H_ */
    489