126fa459cSmrg/* Copyright 2013 Google Inc. All Rights Reserved. 226fa459cSmrg 326fa459cSmrg Distributed under MIT license. 426fa459cSmrg See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 526fa459cSmrg*/ 626fa459cSmrg 726fa459cSmrg/* Literal cost model to allow backward reference replacement to be efficient. 826fa459cSmrg*/ 926fa459cSmrg 1026fa459cSmrg#include "./literal_cost.h" 1126fa459cSmrg 1226fa459cSmrg#include "../common/platform.h" 1326fa459cSmrg#include <brotli/types.h> 1426fa459cSmrg#include "./fast_log.h" 1526fa459cSmrg#include "./utf8_util.h" 1626fa459cSmrg 1726fa459cSmrg#if defined(__cplusplus) || defined(c_plusplus) 1826fa459cSmrgextern "C" { 1926fa459cSmrg#endif 2026fa459cSmrg 2126fa459cSmrgstatic size_t UTF8Position(size_t last, size_t c, size_t clamp) { 2226fa459cSmrg if (c < 128) { 2326fa459cSmrg return 0; /* Next one is the 'Byte 1' again. */ 2426fa459cSmrg } else if (c >= 192) { /* Next one is the 'Byte 2' of utf-8 encoding. */ 2526fa459cSmrg return BROTLI_MIN(size_t, 1, clamp); 2626fa459cSmrg } else { 2726fa459cSmrg /* Let's decide over the last byte if this ends the sequence. */ 2826fa459cSmrg if (last < 0xE0) { 2926fa459cSmrg return 0; /* Completed two or three byte coding. */ 3026fa459cSmrg } else { /* Next one is the 'Byte 3' of utf-8 encoding. */ 3126fa459cSmrg return BROTLI_MIN(size_t, 2, clamp); 3226fa459cSmrg } 3326fa459cSmrg } 3426fa459cSmrg} 3526fa459cSmrg 3626fa459cSmrgstatic size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask, 3726fa459cSmrg const uint8_t* data) { 3826fa459cSmrg size_t counts[3] = { 0 }; 3926fa459cSmrg size_t max_utf8 = 1; /* should be 2, but 1 compresses better. */ 4026fa459cSmrg size_t last_c = 0; 4126fa459cSmrg size_t i; 4226fa459cSmrg for (i = 0; i < len; ++i) { 4326fa459cSmrg size_t c = data[(pos + i) & mask]; 4426fa459cSmrg ++counts[UTF8Position(last_c, c, 2)]; 4526fa459cSmrg last_c = c; 4626fa459cSmrg } 4726fa459cSmrg if (counts[2] < 500) { 4826fa459cSmrg max_utf8 = 1; 4926fa459cSmrg } 5026fa459cSmrg if (counts[1] + counts[2] < 25) { 5126fa459cSmrg max_utf8 = 0; 5226fa459cSmrg } 5326fa459cSmrg return max_utf8; 5426fa459cSmrg} 5526fa459cSmrg 5626fa459cSmrgstatic void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, 5726fa459cSmrg const uint8_t* data, float* cost) { 5826fa459cSmrg /* max_utf8 is 0 (normal ASCII single byte modeling), 5926fa459cSmrg 1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */ 6026fa459cSmrg const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data); 6126fa459cSmrg size_t histogram[3][256] = { { 0 } }; 6226fa459cSmrg size_t window_half = 495; 6326fa459cSmrg size_t in_window = BROTLI_MIN(size_t, window_half, len); 6426fa459cSmrg size_t in_window_utf8[3] = { 0 }; 6526fa459cSmrg 6626fa459cSmrg size_t i; 6726fa459cSmrg { /* Bootstrap histograms. */ 6826fa459cSmrg size_t last_c = 0; 6926fa459cSmrg size_t utf8_pos = 0; 7026fa459cSmrg for (i = 0; i < in_window; ++i) { 7126fa459cSmrg size_t c = data[(pos + i) & mask]; 7226fa459cSmrg ++histogram[utf8_pos][c]; 7326fa459cSmrg ++in_window_utf8[utf8_pos]; 7426fa459cSmrg utf8_pos = UTF8Position(last_c, c, max_utf8); 7526fa459cSmrg last_c = c; 7626fa459cSmrg } 7726fa459cSmrg } 7826fa459cSmrg 7926fa459cSmrg /* Compute bit costs with sliding window. */ 8026fa459cSmrg for (i = 0; i < len; ++i) { 8126fa459cSmrg if (i >= window_half) { 8226fa459cSmrg /* Remove a byte in the past. */ 8326fa459cSmrg size_t c = 8426fa459cSmrg i < window_half + 1 ? 0 : data[(pos + i - window_half - 1) & mask]; 8526fa459cSmrg size_t last_c = 8626fa459cSmrg i < window_half + 2 ? 0 : data[(pos + i - window_half - 2) & mask]; 8726fa459cSmrg size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8); 8826fa459cSmrg --histogram[utf8_pos2][data[(pos + i - window_half) & mask]]; 8926fa459cSmrg --in_window_utf8[utf8_pos2]; 9026fa459cSmrg } 9126fa459cSmrg if (i + window_half < len) { 9226fa459cSmrg /* Add a byte in the future. */ 9326fa459cSmrg size_t c = data[(pos + i + window_half - 1) & mask]; 9426fa459cSmrg size_t last_c = data[(pos + i + window_half - 2) & mask]; 9526fa459cSmrg size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8); 9626fa459cSmrg ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]]; 9726fa459cSmrg ++in_window_utf8[utf8_pos2]; 9826fa459cSmrg } 9926fa459cSmrg { 10026fa459cSmrg size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask]; 10126fa459cSmrg size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask]; 10226fa459cSmrg size_t utf8_pos = UTF8Position(last_c, c, max_utf8); 10326fa459cSmrg size_t masked_pos = (pos + i) & mask; 10426fa459cSmrg size_t histo = histogram[utf8_pos][data[masked_pos]]; 10526fa459cSmrg double lit_cost; 10626fa459cSmrg if (histo == 0) { 10726fa459cSmrg histo = 1; 10826fa459cSmrg } 10926fa459cSmrg lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo); 11026fa459cSmrg lit_cost += 0.02905; 11126fa459cSmrg if (lit_cost < 1.0) { 11226fa459cSmrg lit_cost *= 0.5; 11326fa459cSmrg lit_cost += 0.5; 11426fa459cSmrg } 11526fa459cSmrg /* Make the first bytes more expensive -- seems to help, not sure why. 11626fa459cSmrg Perhaps because the entropy source is changing its properties 11726fa459cSmrg rapidly in the beginning of the file, perhaps because the beginning 11826fa459cSmrg of the data is a statistical "anomaly". */ 11926fa459cSmrg if (i < 2000) { 12026fa459cSmrg lit_cost += 0.7 - ((double)(2000 - i) / 2000.0 * 0.35); 12126fa459cSmrg } 12226fa459cSmrg cost[i] = (float)lit_cost; 12326fa459cSmrg } 12426fa459cSmrg } 12526fa459cSmrg} 12626fa459cSmrg 12726fa459cSmrgvoid BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, 12826fa459cSmrg const uint8_t* data, float* cost) { 12926fa459cSmrg if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) { 13026fa459cSmrg EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost); 13126fa459cSmrg return; 13226fa459cSmrg } else { 13326fa459cSmrg size_t histogram[256] = { 0 }; 13426fa459cSmrg size_t window_half = 2000; 13526fa459cSmrg size_t in_window = BROTLI_MIN(size_t, window_half, len); 13626fa459cSmrg 13726fa459cSmrg /* Bootstrap histogram. */ 13826fa459cSmrg size_t i; 13926fa459cSmrg for (i = 0; i < in_window; ++i) { 14026fa459cSmrg ++histogram[data[(pos + i) & mask]]; 14126fa459cSmrg } 14226fa459cSmrg 14326fa459cSmrg /* Compute bit costs with sliding window. */ 14426fa459cSmrg for (i = 0; i < len; ++i) { 14526fa459cSmrg size_t histo; 14626fa459cSmrg if (i >= window_half) { 14726fa459cSmrg /* Remove a byte in the past. */ 14826fa459cSmrg --histogram[data[(pos + i - window_half) & mask]]; 14926fa459cSmrg --in_window; 15026fa459cSmrg } 15126fa459cSmrg if (i + window_half < len) { 15226fa459cSmrg /* Add a byte in the future. */ 15326fa459cSmrg ++histogram[data[(pos + i + window_half) & mask]]; 15426fa459cSmrg ++in_window; 15526fa459cSmrg } 15626fa459cSmrg histo = histogram[data[(pos + i) & mask]]; 15726fa459cSmrg if (histo == 0) { 15826fa459cSmrg histo = 1; 15926fa459cSmrg } 16026fa459cSmrg { 16126fa459cSmrg double lit_cost = FastLog2(in_window) - FastLog2(histo); 16226fa459cSmrg lit_cost += 0.029; 16326fa459cSmrg if (lit_cost < 1.0) { 16426fa459cSmrg lit_cost *= 0.5; 16526fa459cSmrg lit_cost += 0.5; 16626fa459cSmrg } 16726fa459cSmrg cost[i] = (float)lit_cost; 16826fa459cSmrg } 16926fa459cSmrg } 17026fa459cSmrg } 17126fa459cSmrg} 17226fa459cSmrg 17326fa459cSmrg#if defined(__cplusplus) || defined(c_plusplus) 17426fa459cSmrg} /* extern "C" */ 17526fa459cSmrg#endif 176