1/* Copyright 2013 Google Inc. All Rights Reserved.
2
3   Distributed under MIT license.
4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
7#include "./transform.h"
8
9#if defined(__cplusplus) || defined(c_plusplus)
10extern "C" {
11#endif
12
13/* RFC 7932 transforms string data */
14static const char kPrefixSuffix[217] =
15      "\1 \2, \10 of the \4 of \2s \1.\5 and \4 "
16/* 0x  _0 _2  __5        _E    _3  _6 _8     _E */
17      "in \1\"\4 to \2\">\1\n\2. \1]\5 for \3 a \6 "
18/* 2x     _3_ _5    _A_  _D_ _F  _2 _4     _A   _E */
19      "that \1\'\6 with \6 from \4 by \1(\6. T"
20/* 4x       _5_ _7      _E      _5    _A _C */
21      "he \4 on \4 as \4 is \4ing \2\n\t\1:\3ed "
22/* 6x     _3    _8    _D    _2    _7_ _ _A _C */
23      "\2=\"\4 at \3ly \1,\2=\'\5.com/\7. This \5"
24/* 8x  _0 _ _3    _8   _C _E _ _1     _7       _F */
25      " not \3er \3al \4ful \4ive \5less \4es"
26/* Ax       _5   _9   _D    _2    _7     _D */
27      "t \4ize \2\xc2\xa0\4ous \5 the \2e "; /* \0 - implicit trailing zero. */
28/* Cx    _2    _7___ ___ _A    _F     _5        _8 */
29
30static const uint16_t kPrefixSuffixMap[50] = {
31  0x00, 0x02, 0x05, 0x0E, 0x13, 0x16, 0x18, 0x1E, 0x23, 0x25,
32  0x2A, 0x2D, 0x2F, 0x32, 0x34, 0x3A, 0x3E, 0x45, 0x47, 0x4E,
33  0x55, 0x5A, 0x5C, 0x63, 0x68, 0x6D, 0x72, 0x77, 0x7A, 0x7C,
34  0x80, 0x83, 0x88, 0x8C, 0x8E, 0x91, 0x97, 0x9F, 0xA5, 0xA9,
35  0xAD, 0xB2, 0xB7, 0xBD, 0xC2, 0xC7, 0xCA, 0xCF, 0xD5, 0xD8
36};
37
38/* RFC 7932 transforms */
39static const uint8_t kTransformsData[] = {
40  49, BROTLI_TRANSFORM_IDENTITY, 49,
41  49, BROTLI_TRANSFORM_IDENTITY, 0,
42   0, BROTLI_TRANSFORM_IDENTITY, 0,
43  49, BROTLI_TRANSFORM_OMIT_FIRST_1, 49,
44  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 0,
45  49, BROTLI_TRANSFORM_IDENTITY, 47,
46   0, BROTLI_TRANSFORM_IDENTITY, 49,
47   4, BROTLI_TRANSFORM_IDENTITY, 0,
48  49, BROTLI_TRANSFORM_IDENTITY, 3,
49  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 49,
50  49, BROTLI_TRANSFORM_IDENTITY, 6,
51  49, BROTLI_TRANSFORM_OMIT_FIRST_2, 49,
52  49, BROTLI_TRANSFORM_OMIT_LAST_1, 49,
53   1, BROTLI_TRANSFORM_IDENTITY, 0,
54  49, BROTLI_TRANSFORM_IDENTITY, 1,
55   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 0,
56  49, BROTLI_TRANSFORM_IDENTITY, 7,
57  49, BROTLI_TRANSFORM_IDENTITY, 9,
58  48, BROTLI_TRANSFORM_IDENTITY, 0,
59  49, BROTLI_TRANSFORM_IDENTITY, 8,
60  49, BROTLI_TRANSFORM_IDENTITY, 5,
61  49, BROTLI_TRANSFORM_IDENTITY, 10,
62  49, BROTLI_TRANSFORM_IDENTITY, 11,
63  49, BROTLI_TRANSFORM_OMIT_LAST_3, 49,
64  49, BROTLI_TRANSFORM_IDENTITY, 13,
65  49, BROTLI_TRANSFORM_IDENTITY, 14,
66  49, BROTLI_TRANSFORM_OMIT_FIRST_3, 49,
67  49, BROTLI_TRANSFORM_OMIT_LAST_2, 49,
68  49, BROTLI_TRANSFORM_IDENTITY, 15,
69  49, BROTLI_TRANSFORM_IDENTITY, 16,
70   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 49,
71  49, BROTLI_TRANSFORM_IDENTITY, 12,
72   5, BROTLI_TRANSFORM_IDENTITY, 49,
73   0, BROTLI_TRANSFORM_IDENTITY, 1,
74  49, BROTLI_TRANSFORM_OMIT_FIRST_4, 49,
75  49, BROTLI_TRANSFORM_IDENTITY, 18,
76  49, BROTLI_TRANSFORM_IDENTITY, 17,
77  49, BROTLI_TRANSFORM_IDENTITY, 19,
78  49, BROTLI_TRANSFORM_IDENTITY, 20,
79  49, BROTLI_TRANSFORM_OMIT_FIRST_5, 49,
80  49, BROTLI_TRANSFORM_OMIT_FIRST_6, 49,
81  47, BROTLI_TRANSFORM_IDENTITY, 49,
82  49, BROTLI_TRANSFORM_OMIT_LAST_4, 49,
83  49, BROTLI_TRANSFORM_IDENTITY, 22,
84  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 49,
85  49, BROTLI_TRANSFORM_IDENTITY, 23,
86  49, BROTLI_TRANSFORM_IDENTITY, 24,
87  49, BROTLI_TRANSFORM_IDENTITY, 25,
88  49, BROTLI_TRANSFORM_OMIT_LAST_7, 49,
89  49, BROTLI_TRANSFORM_OMIT_LAST_1, 26,
90  49, BROTLI_TRANSFORM_IDENTITY, 27,
91  49, BROTLI_TRANSFORM_IDENTITY, 28,
92   0, BROTLI_TRANSFORM_IDENTITY, 12,
93  49, BROTLI_TRANSFORM_IDENTITY, 29,
94  49, BROTLI_TRANSFORM_OMIT_FIRST_9, 49,
95  49, BROTLI_TRANSFORM_OMIT_FIRST_7, 49,
96  49, BROTLI_TRANSFORM_OMIT_LAST_6, 49,
97  49, BROTLI_TRANSFORM_IDENTITY, 21,
98  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 1,
99  49, BROTLI_TRANSFORM_OMIT_LAST_8, 49,
100  49, BROTLI_TRANSFORM_IDENTITY, 31,
101  49, BROTLI_TRANSFORM_IDENTITY, 32,
102  47, BROTLI_TRANSFORM_IDENTITY, 3,
103  49, BROTLI_TRANSFORM_OMIT_LAST_5, 49,
104  49, BROTLI_TRANSFORM_OMIT_LAST_9, 49,
105   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 1,
106  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 8,
107   5, BROTLI_TRANSFORM_IDENTITY, 21,
108  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 0,
109  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 10,
110  49, BROTLI_TRANSFORM_IDENTITY, 30,
111   0, BROTLI_TRANSFORM_IDENTITY, 5,
112  35, BROTLI_TRANSFORM_IDENTITY, 49,
113  47, BROTLI_TRANSFORM_IDENTITY, 2,
114  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 17,
115  49, BROTLI_TRANSFORM_IDENTITY, 36,
116  49, BROTLI_TRANSFORM_IDENTITY, 33,
117   5, BROTLI_TRANSFORM_IDENTITY, 0,
118  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 21,
119  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 5,
120  49, BROTLI_TRANSFORM_IDENTITY, 37,
121   0, BROTLI_TRANSFORM_IDENTITY, 30,
122  49, BROTLI_TRANSFORM_IDENTITY, 38,
123   0, BROTLI_TRANSFORM_UPPERCASE_ALL, 0,
124  49, BROTLI_TRANSFORM_IDENTITY, 39,
125   0, BROTLI_TRANSFORM_UPPERCASE_ALL, 49,
126  49, BROTLI_TRANSFORM_IDENTITY, 34,
127  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 8,
128  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 12,
129   0, BROTLI_TRANSFORM_IDENTITY, 21,
130  49, BROTLI_TRANSFORM_IDENTITY, 40,
131   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 12,
132  49, BROTLI_TRANSFORM_IDENTITY, 41,
133  49, BROTLI_TRANSFORM_IDENTITY, 42,
134  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 17,
135  49, BROTLI_TRANSFORM_IDENTITY, 43,
136   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 5,
137  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 10,
138   0, BROTLI_TRANSFORM_IDENTITY, 34,
139  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 33,
140  49, BROTLI_TRANSFORM_IDENTITY, 44,
141  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 5,
142  45, BROTLI_TRANSFORM_IDENTITY, 49,
143   0, BROTLI_TRANSFORM_IDENTITY, 33,
144  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 30,
145  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 30,
146  49, BROTLI_TRANSFORM_IDENTITY, 46,
147  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 1,
148  49, BROTLI_TRANSFORM_UPPERCASE_FIRST, 34,
149   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 33,
150   0, BROTLI_TRANSFORM_UPPERCASE_ALL, 30,
151   0, BROTLI_TRANSFORM_UPPERCASE_ALL, 1,
152  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 33,
153  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 21,
154  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 12,
155   0, BROTLI_TRANSFORM_UPPERCASE_ALL, 5,
156  49, BROTLI_TRANSFORM_UPPERCASE_ALL, 34,
157   0, BROTLI_TRANSFORM_UPPERCASE_ALL, 12,
158   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 30,
159   0, BROTLI_TRANSFORM_UPPERCASE_ALL, 34,
160   0, BROTLI_TRANSFORM_UPPERCASE_FIRST, 34,
161};
162
163static const BrotliTransforms kBrotliTransforms = {
164  sizeof(kPrefixSuffix),
165  (const uint8_t*)kPrefixSuffix,
166  kPrefixSuffixMap,
167  sizeof(kTransformsData) / (3 * sizeof(kTransformsData[0])),
168  kTransformsData,
169  NULL,  /* no extra parameters */
170  {0, 12, 27, 23, 42, 63, 56, 48, 59, 64}
171};
172
173const BrotliTransforms* BrotliGetTransforms(void) {
174  return &kBrotliTransforms;
175}
176
177static int ToUpperCase(uint8_t* p) {
178  if (p[0] < 0xC0) {
179    if (p[0] >= 'a' && p[0] <= 'z') {
180      p[0] ^= 32;
181    }
182    return 1;
183  }
184  /* An overly simplified uppercasing model for UTF-8. */
185  if (p[0] < 0xE0) {
186    p[1] ^= 32;
187    return 2;
188  }
189  /* An arbitrary transform for three byte characters. */
190  p[2] ^= 5;
191  return 3;
192}
193
194static int Shift(uint8_t* word, int word_len, uint16_t parameter) {
195  /* Limited sign extension: scalar < (1 << 24). */
196  uint32_t scalar =
197      (parameter & 0x7FFFu) + (0x1000000u - (parameter & 0x8000u));
198  if (word[0] < 0x80) {
199    /* 1-byte rune / 0sssssss / 7 bit scalar (ASCII). */
200    scalar += (uint32_t)word[0];
201    word[0] = (uint8_t)(scalar & 0x7Fu);
202    return 1;
203  } else if (word[0] < 0xC0) {
204    /* Continuation / 10AAAAAA. */
205    return 1;
206  } else if (word[0] < 0xE0) {
207    /* 2-byte rune / 110sssss AAssssss / 11 bit scalar. */
208    if (word_len < 2) return 1;
209    scalar += (uint32_t)((word[1] & 0x3Fu) | ((word[0] & 0x1Fu) << 6u));
210    word[0] = (uint8_t)(0xC0 | ((scalar >> 6u) & 0x1F));
211    word[1] = (uint8_t)((word[1] & 0xC0) | (scalar & 0x3F));
212    return 2;
213  } else if (word[0] < 0xF0) {
214    /* 3-byte rune / 1110ssss AAssssss BBssssss / 16 bit scalar. */
215    if (word_len < 3) return word_len;
216    scalar += (uint32_t)((word[2] & 0x3Fu) | ((word[1] & 0x3Fu) << 6u) |
217        ((word[0] & 0x0Fu) << 12u));
218    word[0] = (uint8_t)(0xE0 | ((scalar >> 12u) & 0x0F));
219    word[1] = (uint8_t)((word[1] & 0xC0) | ((scalar >> 6u) & 0x3F));
220    word[2] = (uint8_t)((word[2] & 0xC0) | (scalar & 0x3F));
221    return 3;
222  } else if (word[0] < 0xF8) {
223    /* 4-byte rune / 11110sss AAssssss BBssssss CCssssss / 21 bit scalar. */
224    if (word_len < 4) return word_len;
225    scalar += (uint32_t)((word[3] & 0x3Fu) | ((word[2] & 0x3Fu) << 6u) |
226        ((word[1] & 0x3Fu) << 12u) | ((word[0] & 0x07u) << 18u));
227    word[0] = (uint8_t)(0xF0 | ((scalar >> 18u) & 0x07));
228    word[1] = (uint8_t)((word[1] & 0xC0) | ((scalar >> 12u) & 0x3F));
229    word[2] = (uint8_t)((word[2] & 0xC0) | ((scalar >> 6u) & 0x3F));
230    word[3] = (uint8_t)((word[3] & 0xC0) | (scalar & 0x3F));
231    return 4;
232  }
233  return 1;
234}
235
236int BrotliTransformDictionaryWord(uint8_t* dst, const uint8_t* word, int len,
237    const BrotliTransforms* transforms, int transform_idx) {
238  int idx = 0;
239  const uint8_t* prefix = BROTLI_TRANSFORM_PREFIX(transforms, transform_idx);
240  uint8_t type = BROTLI_TRANSFORM_TYPE(transforms, transform_idx);
241  const uint8_t* suffix = BROTLI_TRANSFORM_SUFFIX(transforms, transform_idx);
242  {
243    int prefix_len = *prefix++;
244    while (prefix_len--) { dst[idx++] = *prefix++; }
245  }
246  {
247    const int t = type;
248    int i = 0;
249    if (t <= BROTLI_TRANSFORM_OMIT_LAST_9) {
250      len -= t;
251    } else if (t >= BROTLI_TRANSFORM_OMIT_FIRST_1
252        && t <= BROTLI_TRANSFORM_OMIT_FIRST_9) {
253      int skip = t - (BROTLI_TRANSFORM_OMIT_FIRST_1 - 1);
254      word += skip;
255      len -= skip;
256    }
257    while (i < len) { dst[idx++] = word[i++]; }
258    if (t == BROTLI_TRANSFORM_UPPERCASE_FIRST) {
259      ToUpperCase(&dst[idx - len]);
260    } else if (t == BROTLI_TRANSFORM_UPPERCASE_ALL) {
261      uint8_t* uppercase = &dst[idx - len];
262      while (len > 0) {
263        int step = ToUpperCase(uppercase);
264        uppercase += step;
265        len -= step;
266      }
267    } else if (t == BROTLI_TRANSFORM_SHIFT_FIRST) {
268      uint16_t param = (uint16_t)(transforms->params[transform_idx * 2]
269          + (transforms->params[transform_idx * 2 + 1] << 8u));
270      Shift(&dst[idx - len], len, param);
271    } else if (t == BROTLI_TRANSFORM_SHIFT_ALL) {
272      uint16_t param = (uint16_t)(transforms->params[transform_idx * 2]
273          + (transforms->params[transform_idx * 2 + 1] << 8u));
274      uint8_t* shift = &dst[idx - len];
275      while (len > 0) {
276        int step = Shift(shift, len, param);
277        shift += step;
278        len -= step;
279      }
280    }
281  }
282  {
283    int suffix_len = *suffix++;
284    while (suffix_len--) { dst[idx++] = *suffix++; }
285    return idx;
286  }
287}
288
289#if defined(__cplusplus) || defined(c_plusplus)
290}  /* extern "C" */
291#endif
292