126fa459cSmrg/* Copyright 2013 Google Inc. All Rights Reserved.
226fa459cSmrg
326fa459cSmrg   Distributed under MIT license.
426fa459cSmrg   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
526fa459cSmrg*/
626fa459cSmrg
726fa459cSmrg/* Literal cost model to allow backward reference replacement to be efficient.
826fa459cSmrg*/
926fa459cSmrg
1026fa459cSmrg#include "./literal_cost.h"
1126fa459cSmrg
1226fa459cSmrg#include "../common/platform.h"
1326fa459cSmrg#include <brotli/types.h>
1426fa459cSmrg#include "./fast_log.h"
1526fa459cSmrg#include "./utf8_util.h"
1626fa459cSmrg
1726fa459cSmrg#if defined(__cplusplus) || defined(c_plusplus)
1826fa459cSmrgextern "C" {
1926fa459cSmrg#endif
2026fa459cSmrg
2126fa459cSmrgstatic size_t UTF8Position(size_t last, size_t c, size_t clamp) {
2226fa459cSmrg  if (c < 128) {
2326fa459cSmrg    return 0;  /* Next one is the 'Byte 1' again. */
2426fa459cSmrg  } else if (c >= 192) {  /* Next one is the 'Byte 2' of utf-8 encoding. */
2526fa459cSmrg    return BROTLI_MIN(size_t, 1, clamp);
2626fa459cSmrg  } else {
2726fa459cSmrg    /* Let's decide over the last byte if this ends the sequence. */
2826fa459cSmrg    if (last < 0xE0) {
2926fa459cSmrg      return 0;  /* Completed two or three byte coding. */
3026fa459cSmrg    } else {  /* Next one is the 'Byte 3' of utf-8 encoding. */
3126fa459cSmrg      return BROTLI_MIN(size_t, 2, clamp);
3226fa459cSmrg    }
3326fa459cSmrg  }
3426fa459cSmrg}
3526fa459cSmrg
3626fa459cSmrgstatic size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
3726fa459cSmrg                                        const uint8_t* data) {
3826fa459cSmrg  size_t counts[3] = { 0 };
3926fa459cSmrg  size_t max_utf8 = 1;  /* should be 2, but 1 compresses better. */
4026fa459cSmrg  size_t last_c = 0;
4126fa459cSmrg  size_t i;
4226fa459cSmrg  for (i = 0; i < len; ++i) {
4326fa459cSmrg    size_t c = data[(pos + i) & mask];
4426fa459cSmrg    ++counts[UTF8Position(last_c, c, 2)];
4526fa459cSmrg    last_c = c;
4626fa459cSmrg  }
4726fa459cSmrg  if (counts[2] < 500) {
4826fa459cSmrg    max_utf8 = 1;
4926fa459cSmrg  }
5026fa459cSmrg  if (counts[1] + counts[2] < 25) {
5126fa459cSmrg    max_utf8 = 0;
5226fa459cSmrg  }
5326fa459cSmrg  return max_utf8;
5426fa459cSmrg}
5526fa459cSmrg
5626fa459cSmrgstatic void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
5726fa459cSmrg                                            const uint8_t* data, float* cost) {
5826fa459cSmrg  /* max_utf8 is 0 (normal ASCII single byte modeling),
5926fa459cSmrg     1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
6026fa459cSmrg  const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
6126fa459cSmrg  size_t histogram[3][256] = { { 0 } };
6226fa459cSmrg  size_t window_half = 495;
6326fa459cSmrg  size_t in_window = BROTLI_MIN(size_t, window_half, len);
6426fa459cSmrg  size_t in_window_utf8[3] = { 0 };
6526fa459cSmrg
6626fa459cSmrg  size_t i;
6726fa459cSmrg  {  /* Bootstrap histograms. */
6826fa459cSmrg    size_t last_c = 0;
6926fa459cSmrg    size_t utf8_pos = 0;
7026fa459cSmrg    for (i = 0; i < in_window; ++i) {
7126fa459cSmrg      size_t c = data[(pos + i) & mask];
7226fa459cSmrg      ++histogram[utf8_pos][c];
7326fa459cSmrg      ++in_window_utf8[utf8_pos];
7426fa459cSmrg      utf8_pos = UTF8Position(last_c, c, max_utf8);
7526fa459cSmrg      last_c = c;
7626fa459cSmrg    }
7726fa459cSmrg  }
7826fa459cSmrg
7926fa459cSmrg  /* Compute bit costs with sliding window. */
8026fa459cSmrg  for (i = 0; i < len; ++i) {
8126fa459cSmrg    if (i >= window_half) {
8226fa459cSmrg      /* Remove a byte in the past. */
8326fa459cSmrg      size_t c =
8426fa459cSmrg          i < window_half + 1 ? 0 : data[(pos + i - window_half - 1) & mask];
8526fa459cSmrg      size_t last_c =
8626fa459cSmrg          i < window_half + 2 ? 0 : data[(pos + i - window_half - 2) & mask];
8726fa459cSmrg      size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
8826fa459cSmrg      --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
8926fa459cSmrg      --in_window_utf8[utf8_pos2];
9026fa459cSmrg    }
9126fa459cSmrg    if (i + window_half < len) {
9226fa459cSmrg      /* Add a byte in the future. */
9326fa459cSmrg      size_t c = data[(pos + i + window_half - 1) & mask];
9426fa459cSmrg      size_t last_c = data[(pos + i + window_half - 2) & mask];
9526fa459cSmrg      size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
9626fa459cSmrg      ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
9726fa459cSmrg      ++in_window_utf8[utf8_pos2];
9826fa459cSmrg    }
9926fa459cSmrg    {
10026fa459cSmrg      size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask];
10126fa459cSmrg      size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
10226fa459cSmrg      size_t utf8_pos = UTF8Position(last_c, c, max_utf8);
10326fa459cSmrg      size_t masked_pos = (pos + i) & mask;
10426fa459cSmrg      size_t histo = histogram[utf8_pos][data[masked_pos]];
10526fa459cSmrg      double lit_cost;
10626fa459cSmrg      if (histo == 0) {
10726fa459cSmrg        histo = 1;
10826fa459cSmrg      }
10926fa459cSmrg      lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
11026fa459cSmrg      lit_cost += 0.02905;
11126fa459cSmrg      if (lit_cost < 1.0) {
11226fa459cSmrg        lit_cost *= 0.5;
11326fa459cSmrg        lit_cost += 0.5;
11426fa459cSmrg      }
11526fa459cSmrg      /* Make the first bytes more expensive -- seems to help, not sure why.
11626fa459cSmrg         Perhaps because the entropy source is changing its properties
11726fa459cSmrg         rapidly in the beginning of the file, perhaps because the beginning
11826fa459cSmrg         of the data is a statistical "anomaly". */
11926fa459cSmrg      if (i < 2000) {
12026fa459cSmrg        lit_cost += 0.7 - ((double)(2000 - i) / 2000.0 * 0.35);
12126fa459cSmrg      }
12226fa459cSmrg      cost[i] = (float)lit_cost;
12326fa459cSmrg    }
12426fa459cSmrg  }
12526fa459cSmrg}
12626fa459cSmrg
12726fa459cSmrgvoid BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
12826fa459cSmrg                                       const uint8_t* data, float* cost) {
12926fa459cSmrg  if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
13026fa459cSmrg    EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost);
13126fa459cSmrg    return;
13226fa459cSmrg  } else {
13326fa459cSmrg    size_t histogram[256] = { 0 };
13426fa459cSmrg    size_t window_half = 2000;
13526fa459cSmrg    size_t in_window = BROTLI_MIN(size_t, window_half, len);
13626fa459cSmrg
13726fa459cSmrg    /* Bootstrap histogram. */
13826fa459cSmrg    size_t i;
13926fa459cSmrg    for (i = 0; i < in_window; ++i) {
14026fa459cSmrg      ++histogram[data[(pos + i) & mask]];
14126fa459cSmrg    }
14226fa459cSmrg
14326fa459cSmrg    /* Compute bit costs with sliding window. */
14426fa459cSmrg    for (i = 0; i < len; ++i) {
14526fa459cSmrg      size_t histo;
14626fa459cSmrg      if (i >= window_half) {
14726fa459cSmrg        /* Remove a byte in the past. */
14826fa459cSmrg        --histogram[data[(pos + i - window_half) & mask]];
14926fa459cSmrg        --in_window;
15026fa459cSmrg      }
15126fa459cSmrg      if (i + window_half < len) {
15226fa459cSmrg        /* Add a byte in the future. */
15326fa459cSmrg        ++histogram[data[(pos + i + window_half) & mask]];
15426fa459cSmrg        ++in_window;
15526fa459cSmrg      }
15626fa459cSmrg      histo = histogram[data[(pos + i) & mask]];
15726fa459cSmrg      if (histo == 0) {
15826fa459cSmrg        histo = 1;
15926fa459cSmrg      }
16026fa459cSmrg      {
16126fa459cSmrg        double lit_cost = FastLog2(in_window) - FastLog2(histo);
16226fa459cSmrg        lit_cost += 0.029;
16326fa459cSmrg        if (lit_cost < 1.0) {
16426fa459cSmrg          lit_cost *= 0.5;
16526fa459cSmrg          lit_cost += 0.5;
16626fa459cSmrg        }
16726fa459cSmrg        cost[i] = (float)lit_cost;
16826fa459cSmrg      }
16926fa459cSmrg    }
17026fa459cSmrg  }
17126fa459cSmrg}
17226fa459cSmrg
17326fa459cSmrg#if defined(__cplusplus) || defined(c_plusplus)
17426fa459cSmrg}  /* extern "C" */
17526fa459cSmrg#endif
176