cover.c revision 1.1 1 1.1 christos /*
2 1.1 christos * Copyright (c) Meta Platforms, Inc. and affiliates.
3 1.1 christos * All rights reserved.
4 1.1 christos *
5 1.1 christos * This source code is licensed under both the BSD-style license (found in the
6 1.1 christos * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 1.1 christos * in the COPYING file in the root directory of this source tree).
8 1.1 christos * You may select, at your option, one of the above-listed licenses.
9 1.1 christos */
10 1.1 christos
11 1.1 christos /* *****************************************************************************
12 1.1 christos * Constructs a dictionary using a heuristic based on the following paper:
13 1.1 christos *
14 1.1 christos * Liao, Petri, Moffat, Wirth
15 1.1 christos * Effective Construction of Relative Lempel-Ziv Dictionaries
16 1.1 christos * Published in WWW 2016.
17 1.1 christos *
18 1.1 christos * Adapted from code originally written by @ot (Giuseppe Ottaviano).
19 1.1 christos ******************************************************************************/
20 1.1 christos
21 1.1 christos /*-*************************************
22 1.1 christos * Dependencies
23 1.1 christos ***************************************/
24 1.1 christos #include <stdio.h> /* fprintf */
25 1.1 christos #include <stdlib.h> /* malloc, free, qsort */
26 1.1 christos #include <string.h> /* memset */
27 1.1 christos #include <time.h> /* clock */
28 1.1 christos
29 1.1 christos #ifndef ZDICT_STATIC_LINKING_ONLY
30 1.1 christos # define ZDICT_STATIC_LINKING_ONLY
31 1.1 christos #endif
32 1.1 christos
33 1.1 christos #include "../common/mem.h" /* read */
34 1.1 christos #include "../common/pool.h" /* POOL_ctx */
35 1.1 christos #include "../common/threading.h" /* ZSTD_pthread_mutex_t */
36 1.1 christos #include "../common/zstd_internal.h" /* includes zstd.h */
37 1.1 christos #include "../common/bits.h" /* ZSTD_highbit32 */
38 1.1 christos #include "../zdict.h"
39 1.1 christos #include "cover.h"
40 1.1 christos
41 1.1 christos /*-*************************************
42 1.1 christos * Constants
43 1.1 christos ***************************************/
44 1.1 christos /**
45 1.1 christos * There are 32bit indexes used to ref samples, so limit samples size to 4GB
46 1.1 christos * on 64bit builds.
47 1.1 christos * For 32bit builds we choose 1 GB.
48 1.1 christos * Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
49 1.1 christos * contiguous buffer, so 1GB is already a high limit.
50 1.1 christos */
51 1.1 christos #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
52 1.1 christos #define COVER_DEFAULT_SPLITPOINT 1.0
53 1.1 christos
54 1.1 christos /*-*************************************
55 1.1 christos * Console display
56 1.1 christos ***************************************/
57 1.1 christos #ifndef LOCALDISPLAYLEVEL
58 1.1 christos static int g_displayLevel = 0;
59 1.1 christos #endif
60 1.1 christos #undef DISPLAY
61 1.1 christos #define DISPLAY(...) \
62 1.1 christos { \
63 1.1 christos fprintf(stderr, __VA_ARGS__); \
64 1.1 christos fflush(stderr); \
65 1.1 christos }
66 1.1 christos #undef LOCALDISPLAYLEVEL
67 1.1 christos #define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
68 1.1 christos if (displayLevel >= l) { \
69 1.1 christos DISPLAY(__VA_ARGS__); \
70 1.1 christos } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
71 1.1 christos #undef DISPLAYLEVEL
72 1.1 christos #define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
73 1.1 christos
74 1.1 christos #ifndef LOCALDISPLAYUPDATE
75 1.1 christos static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
76 1.1 christos static clock_t g_time = 0;
77 1.1 christos #endif
78 1.1 christos #undef LOCALDISPLAYUPDATE
79 1.1 christos #define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
80 1.1 christos if (displayLevel >= l) { \
81 1.1 christos if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \
82 1.1 christos g_time = clock(); \
83 1.1 christos DISPLAY(__VA_ARGS__); \
84 1.1 christos } \
85 1.1 christos }
86 1.1 christos #undef DISPLAYUPDATE
87 1.1 christos #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
88 1.1 christos
89 1.1 christos /*-*************************************
90 1.1 christos * Hash table
91 1.1 christos ***************************************
92 1.1 christos * A small specialized hash map for storing activeDmers.
93 1.1 christos * The map does not resize, so if it becomes full it will loop forever.
94 1.1 christos * Thus, the map must be large enough to store every value.
95 1.1 christos * The map implements linear probing and keeps its load less than 0.5.
96 1.1 christos */
97 1.1 christos
98 1.1 christos #define MAP_EMPTY_VALUE ((U32)-1)
99 1.1 christos typedef struct COVER_map_pair_t_s {
100 1.1 christos U32 key;
101 1.1 christos U32 value;
102 1.1 christos } COVER_map_pair_t;
103 1.1 christos
104 1.1 christos typedef struct COVER_map_s {
105 1.1 christos COVER_map_pair_t *data;
106 1.1 christos U32 sizeLog;
107 1.1 christos U32 size;
108 1.1 christos U32 sizeMask;
109 1.1 christos } COVER_map_t;
110 1.1 christos
111 1.1 christos /**
112 1.1 christos * Clear the map.
113 1.1 christos */
114 1.1 christos static void COVER_map_clear(COVER_map_t *map) {
115 1.1 christos memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
116 1.1 christos }
117 1.1 christos
118 1.1 christos /**
119 1.1 christos * Initializes a map of the given size.
120 1.1 christos * Returns 1 on success and 0 on failure.
121 1.1 christos * The map must be destroyed with COVER_map_destroy().
122 1.1 christos * The map is only guaranteed to be large enough to hold size elements.
123 1.1 christos */
124 1.1 christos static int COVER_map_init(COVER_map_t *map, U32 size) {
125 1.1 christos map->sizeLog = ZSTD_highbit32(size) + 2;
126 1.1 christos map->size = (U32)1 << map->sizeLog;
127 1.1 christos map->sizeMask = map->size - 1;
128 1.1 christos map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
129 1.1 christos if (!map->data) {
130 1.1 christos map->sizeLog = 0;
131 1.1 christos map->size = 0;
132 1.1 christos return 0;
133 1.1 christos }
134 1.1 christos COVER_map_clear(map);
135 1.1 christos return 1;
136 1.1 christos }
137 1.1 christos
138 1.1 christos /**
139 1.1 christos * Internal hash function
140 1.1 christos */
141 1.1 christos static const U32 COVER_prime4bytes = 2654435761U;
142 1.1 christos static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
143 1.1 christos return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
144 1.1 christos }
145 1.1 christos
146 1.1 christos /**
147 1.1 christos * Helper function that returns the index that a key should be placed into.
148 1.1 christos */
149 1.1 christos static U32 COVER_map_index(COVER_map_t *map, U32 key) {
150 1.1 christos const U32 hash = COVER_map_hash(map, key);
151 1.1 christos U32 i;
152 1.1 christos for (i = hash;; i = (i + 1) & map->sizeMask) {
153 1.1 christos COVER_map_pair_t *pos = &map->data[i];
154 1.1 christos if (pos->value == MAP_EMPTY_VALUE) {
155 1.1 christos return i;
156 1.1 christos }
157 1.1 christos if (pos->key == key) {
158 1.1 christos return i;
159 1.1 christos }
160 1.1 christos }
161 1.1 christos }
162 1.1 christos
163 1.1 christos /**
164 1.1 christos * Returns the pointer to the value for key.
165 1.1 christos * If key is not in the map, it is inserted and the value is set to 0.
166 1.1 christos * The map must not be full.
167 1.1 christos */
168 1.1 christos static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
169 1.1 christos COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
170 1.1 christos if (pos->value == MAP_EMPTY_VALUE) {
171 1.1 christos pos->key = key;
172 1.1 christos pos->value = 0;
173 1.1 christos }
174 1.1 christos return &pos->value;
175 1.1 christos }
176 1.1 christos
177 1.1 christos /**
178 1.1 christos * Deletes key from the map if present.
179 1.1 christos */
180 1.1 christos static void COVER_map_remove(COVER_map_t *map, U32 key) {
181 1.1 christos U32 i = COVER_map_index(map, key);
182 1.1 christos COVER_map_pair_t *del = &map->data[i];
183 1.1 christos U32 shift = 1;
184 1.1 christos if (del->value == MAP_EMPTY_VALUE) {
185 1.1 christos return;
186 1.1 christos }
187 1.1 christos for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
188 1.1 christos COVER_map_pair_t *const pos = &map->data[i];
189 1.1 christos /* If the position is empty we are done */
190 1.1 christos if (pos->value == MAP_EMPTY_VALUE) {
191 1.1 christos del->value = MAP_EMPTY_VALUE;
192 1.1 christos return;
193 1.1 christos }
194 1.1 christos /* If pos can be moved to del do so */
195 1.1 christos if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
196 1.1 christos del->key = pos->key;
197 1.1 christos del->value = pos->value;
198 1.1 christos del = pos;
199 1.1 christos shift = 1;
200 1.1 christos } else {
201 1.1 christos ++shift;
202 1.1 christos }
203 1.1 christos }
204 1.1 christos }
205 1.1 christos
206 1.1 christos /**
207 1.1 christos * Destroys a map that is inited with COVER_map_init().
208 1.1 christos */
209 1.1 christos static void COVER_map_destroy(COVER_map_t *map) {
210 1.1 christos if (map->data) {
211 1.1 christos free(map->data);
212 1.1 christos }
213 1.1 christos map->data = NULL;
214 1.1 christos map->size = 0;
215 1.1 christos }
216 1.1 christos
217 1.1 christos /*-*************************************
218 1.1 christos * Context
219 1.1 christos ***************************************/
220 1.1 christos
221 1.1 christos typedef struct {
222 1.1 christos const BYTE *samples;
223 1.1 christos size_t *offsets;
224 1.1 christos const size_t *samplesSizes;
225 1.1 christos size_t nbSamples;
226 1.1 christos size_t nbTrainSamples;
227 1.1 christos size_t nbTestSamples;
228 1.1 christos U32 *suffix;
229 1.1 christos size_t suffixSize;
230 1.1 christos U32 *freqs;
231 1.1 christos U32 *dmerAt;
232 1.1 christos unsigned d;
233 1.1 christos } COVER_ctx_t;
234 1.1 christos
235 1.1 christos /* We need a global context for qsort... */
236 1.1 christos static COVER_ctx_t *g_coverCtx = NULL;
237 1.1 christos
238 1.1 christos /*-*************************************
239 1.1 christos * Helper functions
240 1.1 christos ***************************************/
241 1.1 christos
242 1.1 christos /**
243 1.1 christos * Returns the sum of the sample sizes.
244 1.1 christos */
245 1.1 christos size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
246 1.1 christos size_t sum = 0;
247 1.1 christos unsigned i;
248 1.1 christos for (i = 0; i < nbSamples; ++i) {
249 1.1 christos sum += samplesSizes[i];
250 1.1 christos }
251 1.1 christos return sum;
252 1.1 christos }
253 1.1 christos
254 1.1 christos /**
255 1.1 christos * Returns -1 if the dmer at lp is less than the dmer at rp.
256 1.1 christos * Return 0 if the dmers at lp and rp are equal.
257 1.1 christos * Returns 1 if the dmer at lp is greater than the dmer at rp.
258 1.1 christos */
259 1.1 christos static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
260 1.1 christos U32 const lhs = *(U32 const *)lp;
261 1.1 christos U32 const rhs = *(U32 const *)rp;
262 1.1 christos return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
263 1.1 christos }
264 1.1 christos /**
265 1.1 christos * Faster version for d <= 8.
266 1.1 christos */
267 1.1 christos static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
268 1.1 christos U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
269 1.1 christos U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
270 1.1 christos U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
271 1.1 christos if (lhs < rhs) {
272 1.1 christos return -1;
273 1.1 christos }
274 1.1 christos return (lhs > rhs);
275 1.1 christos }
276 1.1 christos
277 1.1 christos /**
278 1.1 christos * Same as COVER_cmp() except ties are broken by pointer value
279 1.1 christos * NOTE: g_coverCtx must be set to call this function. A global is required because
280 1.1 christos * qsort doesn't take an opaque pointer.
281 1.1 christos */
282 1.1 christos static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
283 1.1 christos int result = COVER_cmp(g_coverCtx, lp, rp);
284 1.1 christos if (result == 0) {
285 1.1 christos result = lp < rp ? -1 : 1;
286 1.1 christos }
287 1.1 christos return result;
288 1.1 christos }
289 1.1 christos /**
290 1.1 christos * Faster version for d <= 8.
291 1.1 christos */
292 1.1 christos static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
293 1.1 christos int result = COVER_cmp8(g_coverCtx, lp, rp);
294 1.1 christos if (result == 0) {
295 1.1 christos result = lp < rp ? -1 : 1;
296 1.1 christos }
297 1.1 christos return result;
298 1.1 christos }
299 1.1 christos
300 1.1 christos /**
301 1.1 christos * Returns the first pointer in [first, last) whose element does not compare
302 1.1 christos * less than value. If no such element exists it returns last.
303 1.1 christos */
304 1.1 christos static const size_t *COVER_lower_bound(const size_t* first, const size_t* last,
305 1.1 christos size_t value) {
306 1.1 christos size_t count = (size_t)(last - first);
307 1.1 christos assert(last >= first);
308 1.1 christos while (count != 0) {
309 1.1 christos size_t step = count / 2;
310 1.1 christos const size_t *ptr = first;
311 1.1 christos ptr += step;
312 1.1 christos if (*ptr < value) {
313 1.1 christos first = ++ptr;
314 1.1 christos count -= step + 1;
315 1.1 christos } else {
316 1.1 christos count = step;
317 1.1 christos }
318 1.1 christos }
319 1.1 christos return first;
320 1.1 christos }
321 1.1 christos
322 1.1 christos /**
323 1.1 christos * Generic groupBy function.
324 1.1 christos * Groups an array sorted by cmp into groups with equivalent values.
325 1.1 christos * Calls grp for each group.
326 1.1 christos */
327 1.1 christos static void
328 1.1 christos COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
329 1.1 christos int (*cmp)(COVER_ctx_t *, const void *, const void *),
330 1.1 christos void (*grp)(COVER_ctx_t *, const void *, const void *)) {
331 1.1 christos const BYTE *ptr = (const BYTE *)data;
332 1.1 christos size_t num = 0;
333 1.1 christos while (num < count) {
334 1.1 christos const BYTE *grpEnd = ptr + size;
335 1.1 christos ++num;
336 1.1 christos while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
337 1.1 christos grpEnd += size;
338 1.1 christos ++num;
339 1.1 christos }
340 1.1 christos grp(ctx, ptr, grpEnd);
341 1.1 christos ptr = grpEnd;
342 1.1 christos }
343 1.1 christos }
344 1.1 christos
345 1.1 christos /*-*************************************
346 1.1 christos * Cover functions
347 1.1 christos ***************************************/
348 1.1 christos
349 1.1 christos /**
350 1.1 christos * Called on each group of positions with the same dmer.
351 1.1 christos * Counts the frequency of each dmer and saves it in the suffix array.
352 1.1 christos * Fills `ctx->dmerAt`.
353 1.1 christos */
354 1.1 christos static void COVER_group(COVER_ctx_t *ctx, const void *group,
355 1.1 christos const void *groupEnd) {
356 1.1 christos /* The group consists of all the positions with the same first d bytes. */
357 1.1 christos const U32 *grpPtr = (const U32 *)group;
358 1.1 christos const U32 *grpEnd = (const U32 *)groupEnd;
359 1.1 christos /* The dmerId is how we will reference this dmer.
360 1.1 christos * This allows us to map the whole dmer space to a much smaller space, the
361 1.1 christos * size of the suffix array.
362 1.1 christos */
363 1.1 christos const U32 dmerId = (U32)(grpPtr - ctx->suffix);
364 1.1 christos /* Count the number of samples this dmer shows up in */
365 1.1 christos U32 freq = 0;
366 1.1 christos /* Details */
367 1.1 christos const size_t *curOffsetPtr = ctx->offsets;
368 1.1 christos const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
369 1.1 christos /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
370 1.1 christos * different sample than the last.
371 1.1 christos */
372 1.1 christos size_t curSampleEnd = ctx->offsets[0];
373 1.1 christos for (; grpPtr != grpEnd; ++grpPtr) {
374 1.1 christos /* Save the dmerId for this position so we can get back to it. */
375 1.1 christos ctx->dmerAt[*grpPtr] = dmerId;
376 1.1 christos /* Dictionaries only help for the first reference to the dmer.
377 1.1 christos * After that zstd can reference the match from the previous reference.
378 1.1 christos * So only count each dmer once for each sample it is in.
379 1.1 christos */
380 1.1 christos if (*grpPtr < curSampleEnd) {
381 1.1 christos continue;
382 1.1 christos }
383 1.1 christos freq += 1;
384 1.1 christos /* Binary search to find the end of the sample *grpPtr is in.
385 1.1 christos * In the common case that grpPtr + 1 == grpEnd we can skip the binary
386 1.1 christos * search because the loop is over.
387 1.1 christos */
388 1.1 christos if (grpPtr + 1 != grpEnd) {
389 1.1 christos const size_t *sampleEndPtr =
390 1.1 christos COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
391 1.1 christos curSampleEnd = *sampleEndPtr;
392 1.1 christos curOffsetPtr = sampleEndPtr + 1;
393 1.1 christos }
394 1.1 christos }
395 1.1 christos /* At this point we are never going to look at this segment of the suffix
396 1.1 christos * array again. We take advantage of this fact to save memory.
397 1.1 christos * We store the frequency of the dmer in the first position of the group,
398 1.1 christos * which is dmerId.
399 1.1 christos */
400 1.1 christos ctx->suffix[dmerId] = freq;
401 1.1 christos }
402 1.1 christos
403 1.1 christos
404 1.1 christos /**
405 1.1 christos * Selects the best segment in an epoch.
406 1.1 christos * Segments of are scored according to the function:
407 1.1 christos *
408 1.1 christos * Let F(d) be the frequency of dmer d.
409 1.1 christos * Let S_i be the dmer at position i of segment S which has length k.
410 1.1 christos *
411 1.1 christos * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
412 1.1 christos *
413 1.1 christos * Once the dmer d is in the dictionary we set F(d) = 0.
414 1.1 christos */
415 1.1 christos static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
416 1.1 christos COVER_map_t *activeDmers, U32 begin,
417 1.1 christos U32 end,
418 1.1 christos ZDICT_cover_params_t parameters) {
419 1.1 christos /* Constants */
420 1.1 christos const U32 k = parameters.k;
421 1.1 christos const U32 d = parameters.d;
422 1.1 christos const U32 dmersInK = k - d + 1;
423 1.1 christos /* Try each segment (activeSegment) and save the best (bestSegment) */
424 1.1 christos COVER_segment_t bestSegment = {0, 0, 0};
425 1.1 christos COVER_segment_t activeSegment;
426 1.1 christos /* Reset the activeDmers in the segment */
427 1.1 christos COVER_map_clear(activeDmers);
428 1.1 christos /* The activeSegment starts at the beginning of the epoch. */
429 1.1 christos activeSegment.begin = begin;
430 1.1 christos activeSegment.end = begin;
431 1.1 christos activeSegment.score = 0;
432 1.1 christos /* Slide the activeSegment through the whole epoch.
433 1.1 christos * Save the best segment in bestSegment.
434 1.1 christos */
435 1.1 christos while (activeSegment.end < end) {
436 1.1 christos /* The dmerId for the dmer at the next position */
437 1.1 christos U32 newDmer = ctx->dmerAt[activeSegment.end];
438 1.1 christos /* The entry in activeDmers for this dmerId */
439 1.1 christos U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
440 1.1 christos /* If the dmer isn't already present in the segment add its score. */
441 1.1 christos if (*newDmerOcc == 0) {
442 1.1 christos /* The paper suggest using the L-0.5 norm, but experiments show that it
443 1.1 christos * doesn't help.
444 1.1 christos */
445 1.1 christos activeSegment.score += freqs[newDmer];
446 1.1 christos }
447 1.1 christos /* Add the dmer to the segment */
448 1.1 christos activeSegment.end += 1;
449 1.1 christos *newDmerOcc += 1;
450 1.1 christos
451 1.1 christos /* If the window is now too large, drop the first position */
452 1.1 christos if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
453 1.1 christos U32 delDmer = ctx->dmerAt[activeSegment.begin];
454 1.1 christos U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
455 1.1 christos activeSegment.begin += 1;
456 1.1 christos *delDmerOcc -= 1;
457 1.1 christos /* If this is the last occurrence of the dmer, subtract its score */
458 1.1 christos if (*delDmerOcc == 0) {
459 1.1 christos COVER_map_remove(activeDmers, delDmer);
460 1.1 christos activeSegment.score -= freqs[delDmer];
461 1.1 christos }
462 1.1 christos }
463 1.1 christos
464 1.1 christos /* If this segment is the best so far save it */
465 1.1 christos if (activeSegment.score > bestSegment.score) {
466 1.1 christos bestSegment = activeSegment;
467 1.1 christos }
468 1.1 christos }
469 1.1 christos {
470 1.1 christos /* Trim off the zero frequency head and tail from the segment. */
471 1.1 christos U32 newBegin = bestSegment.end;
472 1.1 christos U32 newEnd = bestSegment.begin;
473 1.1 christos U32 pos;
474 1.1 christos for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
475 1.1 christos U32 freq = freqs[ctx->dmerAt[pos]];
476 1.1 christos if (freq != 0) {
477 1.1 christos newBegin = MIN(newBegin, pos);
478 1.1 christos newEnd = pos + 1;
479 1.1 christos }
480 1.1 christos }
481 1.1 christos bestSegment.begin = newBegin;
482 1.1 christos bestSegment.end = newEnd;
483 1.1 christos }
484 1.1 christos {
485 1.1 christos /* Zero out the frequency of each dmer covered by the chosen segment. */
486 1.1 christos U32 pos;
487 1.1 christos for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
488 1.1 christos freqs[ctx->dmerAt[pos]] = 0;
489 1.1 christos }
490 1.1 christos }
491 1.1 christos return bestSegment;
492 1.1 christos }
493 1.1 christos
494 1.1 christos /**
495 1.1 christos * Check the validity of the parameters.
496 1.1 christos * Returns non-zero if the parameters are valid and 0 otherwise.
497 1.1 christos */
498 1.1 christos static int COVER_checkParameters(ZDICT_cover_params_t parameters,
499 1.1 christos size_t maxDictSize) {
500 1.1 christos /* k and d are required parameters */
501 1.1 christos if (parameters.d == 0 || parameters.k == 0) {
502 1.1 christos return 0;
503 1.1 christos }
504 1.1 christos /* k <= maxDictSize */
505 1.1 christos if (parameters.k > maxDictSize) {
506 1.1 christos return 0;
507 1.1 christos }
508 1.1 christos /* d <= k */
509 1.1 christos if (parameters.d > parameters.k) {
510 1.1 christos return 0;
511 1.1 christos }
512 1.1 christos /* 0 < splitPoint <= 1 */
513 1.1 christos if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
514 1.1 christos return 0;
515 1.1 christos }
516 1.1 christos return 1;
517 1.1 christos }
518 1.1 christos
519 1.1 christos /**
520 1.1 christos * Clean up a context initialized with `COVER_ctx_init()`.
521 1.1 christos */
522 1.1 christos static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
523 1.1 christos if (!ctx) {
524 1.1 christos return;
525 1.1 christos }
526 1.1 christos if (ctx->suffix) {
527 1.1 christos free(ctx->suffix);
528 1.1 christos ctx->suffix = NULL;
529 1.1 christos }
530 1.1 christos if (ctx->freqs) {
531 1.1 christos free(ctx->freqs);
532 1.1 christos ctx->freqs = NULL;
533 1.1 christos }
534 1.1 christos if (ctx->dmerAt) {
535 1.1 christos free(ctx->dmerAt);
536 1.1 christos ctx->dmerAt = NULL;
537 1.1 christos }
538 1.1 christos if (ctx->offsets) {
539 1.1 christos free(ctx->offsets);
540 1.1 christos ctx->offsets = NULL;
541 1.1 christos }
542 1.1 christos }
543 1.1 christos
544 1.1 christos /**
545 1.1 christos * Prepare a context for dictionary building.
546 1.1 christos * The context is only dependent on the parameter `d` and can be used multiple
547 1.1 christos * times.
548 1.1 christos * Returns 0 on success or error code on error.
549 1.1 christos * The context must be destroyed with `COVER_ctx_destroy()`.
550 1.1 christos */
551 1.1 christos static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
552 1.1 christos const size_t *samplesSizes, unsigned nbSamples,
553 1.1 christos unsigned d, double splitPoint)
554 1.1 christos {
555 1.1 christos const BYTE *const samples = (const BYTE *)samplesBuffer;
556 1.1 christos const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
557 1.1 christos /* Split samples into testing and training sets */
558 1.1 christos const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
559 1.1 christos const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
560 1.1 christos const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
561 1.1 christos const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
562 1.1 christos /* Checks */
563 1.1 christos if (totalSamplesSize < MAX(d, sizeof(U64)) ||
564 1.1 christos totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
565 1.1 christos DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
566 1.1 christos (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
567 1.1 christos return ERROR(srcSize_wrong);
568 1.1 christos }
569 1.1 christos /* Check if there are at least 5 training samples */
570 1.1 christos if (nbTrainSamples < 5) {
571 1.1 christos DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
572 1.1 christos return ERROR(srcSize_wrong);
573 1.1 christos }
574 1.1 christos /* Check if there's testing sample */
575 1.1 christos if (nbTestSamples < 1) {
576 1.1 christos DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
577 1.1 christos return ERROR(srcSize_wrong);
578 1.1 christos }
579 1.1 christos /* Zero the context */
580 1.1 christos memset(ctx, 0, sizeof(*ctx));
581 1.1 christos DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
582 1.1 christos (unsigned)trainingSamplesSize);
583 1.1 christos DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
584 1.1 christos (unsigned)testSamplesSize);
585 1.1 christos ctx->samples = samples;
586 1.1 christos ctx->samplesSizes = samplesSizes;
587 1.1 christos ctx->nbSamples = nbSamples;
588 1.1 christos ctx->nbTrainSamples = nbTrainSamples;
589 1.1 christos ctx->nbTestSamples = nbTestSamples;
590 1.1 christos /* Partial suffix array */
591 1.1 christos ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
592 1.1 christos ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
593 1.1 christos /* Maps index to the dmerID */
594 1.1 christos ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
595 1.1 christos /* The offsets of each file */
596 1.1 christos ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
597 1.1 christos if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
598 1.1 christos DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
599 1.1 christos COVER_ctx_destroy(ctx);
600 1.1 christos return ERROR(memory_allocation);
601 1.1 christos }
602 1.1 christos ctx->freqs = NULL;
603 1.1 christos ctx->d = d;
604 1.1 christos
605 1.1 christos /* Fill offsets from the samplesSizes */
606 1.1 christos {
607 1.1 christos U32 i;
608 1.1 christos ctx->offsets[0] = 0;
609 1.1 christos for (i = 1; i <= nbSamples; ++i) {
610 1.1 christos ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
611 1.1 christos }
612 1.1 christos }
613 1.1 christos DISPLAYLEVEL(2, "Constructing partial suffix array\n");
614 1.1 christos {
615 1.1 christos /* suffix is a partial suffix array.
616 1.1 christos * It only sorts suffixes by their first parameters.d bytes.
617 1.1 christos * The sort is stable, so each dmer group is sorted by position in input.
618 1.1 christos */
619 1.1 christos U32 i;
620 1.1 christos for (i = 0; i < ctx->suffixSize; ++i) {
621 1.1 christos ctx->suffix[i] = i;
622 1.1 christos }
623 1.1 christos /* qsort doesn't take an opaque pointer, so pass as a global.
624 1.1 christos * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
625 1.1 christos */
626 1.1 christos g_coverCtx = ctx;
627 1.1 christos #if defined(__OpenBSD__)
628 1.1 christos mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
629 1.1 christos (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
630 1.1 christos #else
631 1.1 christos qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
632 1.1 christos (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
633 1.1 christos #endif
634 1.1 christos }
635 1.1 christos DISPLAYLEVEL(2, "Computing frequencies\n");
636 1.1 christos /* For each dmer group (group of positions with the same first d bytes):
637 1.1 christos * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
638 1.1 christos * (groupBeginPtr - suffix). This allows us to go from position to
639 1.1 christos * dmerID so we can look up values in freq.
640 1.1 christos * 2. We calculate how many samples the dmer occurs in and save it in
641 1.1 christos * freqs[dmerId].
642 1.1 christos */
643 1.1 christos COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
644 1.1 christos (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
645 1.1 christos ctx->freqs = ctx->suffix;
646 1.1 christos ctx->suffix = NULL;
647 1.1 christos return 0;
648 1.1 christos }
649 1.1 christos
650 1.1 christos void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
651 1.1 christos {
652 1.1 christos const double ratio = (double)nbDmers / (double)maxDictSize;
653 1.1 christos if (ratio >= 10) {
654 1.1 christos return;
655 1.1 christos }
656 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 1,
657 1.1 christos "WARNING: The maximum dictionary size %u is too large "
658 1.1 christos "compared to the source size %u! "
659 1.1 christos "size(source)/size(dictionary) = %f, but it should be >= "
660 1.1 christos "10! This may lead to a subpar dictionary! We recommend "
661 1.1 christos "training on sources at least 10x, and preferably 100x "
662 1.1 christos "the size of the dictionary! \n", (U32)maxDictSize,
663 1.1 christos (U32)nbDmers, ratio);
664 1.1 christos }
665 1.1 christos
666 1.1 christos COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
667 1.1 christos U32 nbDmers, U32 k, U32 passes)
668 1.1 christos {
669 1.1 christos const U32 minEpochSize = k * 10;
670 1.1 christos COVER_epoch_info_t epochs;
671 1.1 christos epochs.num = MAX(1, maxDictSize / k / passes);
672 1.1 christos epochs.size = nbDmers / epochs.num;
673 1.1 christos if (epochs.size >= minEpochSize) {
674 1.1 christos assert(epochs.size * epochs.num <= nbDmers);
675 1.1 christos return epochs;
676 1.1 christos }
677 1.1 christos epochs.size = MIN(minEpochSize, nbDmers);
678 1.1 christos epochs.num = nbDmers / epochs.size;
679 1.1 christos assert(epochs.size * epochs.num <= nbDmers);
680 1.1 christos return epochs;
681 1.1 christos }
682 1.1 christos
683 1.1 christos /**
684 1.1 christos * Given the prepared context build the dictionary.
685 1.1 christos */
686 1.1 christos static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
687 1.1 christos COVER_map_t *activeDmers, void *dictBuffer,
688 1.1 christos size_t dictBufferCapacity,
689 1.1 christos ZDICT_cover_params_t parameters) {
690 1.1 christos BYTE *const dict = (BYTE *)dictBuffer;
691 1.1 christos size_t tail = dictBufferCapacity;
692 1.1 christos /* Divide the data into epochs. We will select one segment from each epoch. */
693 1.1 christos const COVER_epoch_info_t epochs = COVER_computeEpochs(
694 1.1 christos (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
695 1.1 christos const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
696 1.1 christos size_t zeroScoreRun = 0;
697 1.1 christos size_t epoch;
698 1.1 christos DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
699 1.1 christos (U32)epochs.num, (U32)epochs.size);
700 1.1 christos /* Loop through the epochs until there are no more segments or the dictionary
701 1.1 christos * is full.
702 1.1 christos */
703 1.1 christos for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
704 1.1 christos const U32 epochBegin = (U32)(epoch * epochs.size);
705 1.1 christos const U32 epochEnd = epochBegin + epochs.size;
706 1.1 christos size_t segmentSize;
707 1.1 christos /* Select a segment */
708 1.1 christos COVER_segment_t segment = COVER_selectSegment(
709 1.1 christos ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
710 1.1 christos /* If the segment covers no dmers, then we are out of content.
711 1.1 christos * There may be new content in other epochs, for continue for some time.
712 1.1 christos */
713 1.1 christos if (segment.score == 0) {
714 1.1 christos if (++zeroScoreRun >= maxZeroScoreRun) {
715 1.1 christos break;
716 1.1 christos }
717 1.1 christos continue;
718 1.1 christos }
719 1.1 christos zeroScoreRun = 0;
720 1.1 christos /* Trim the segment if necessary and if it is too small then we are done */
721 1.1 christos segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
722 1.1 christos if (segmentSize < parameters.d) {
723 1.1 christos break;
724 1.1 christos }
725 1.1 christos /* We fill the dictionary from the back to allow the best segments to be
726 1.1 christos * referenced with the smallest offsets.
727 1.1 christos */
728 1.1 christos tail -= segmentSize;
729 1.1 christos memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
730 1.1 christos DISPLAYUPDATE(
731 1.1 christos 2, "\r%u%% ",
732 1.1 christos (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
733 1.1 christos }
734 1.1 christos DISPLAYLEVEL(2, "\r%79s\r", "");
735 1.1 christos return tail;
736 1.1 christos }
737 1.1 christos
738 1.1 christos ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
739 1.1 christos void *dictBuffer, size_t dictBufferCapacity,
740 1.1 christos const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
741 1.1 christos ZDICT_cover_params_t parameters)
742 1.1 christos {
743 1.1 christos BYTE* const dict = (BYTE*)dictBuffer;
744 1.1 christos COVER_ctx_t ctx;
745 1.1 christos COVER_map_t activeDmers;
746 1.1 christos parameters.splitPoint = 1.0;
747 1.1 christos /* Initialize global data */
748 1.1 christos g_displayLevel = (int)parameters.zParams.notificationLevel;
749 1.1 christos /* Checks */
750 1.1 christos if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
751 1.1 christos DISPLAYLEVEL(1, "Cover parameters incorrect\n");
752 1.1 christos return ERROR(parameter_outOfBound);
753 1.1 christos }
754 1.1 christos if (nbSamples == 0) {
755 1.1 christos DISPLAYLEVEL(1, "Cover must have at least one input file\n");
756 1.1 christos return ERROR(srcSize_wrong);
757 1.1 christos }
758 1.1 christos if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
759 1.1 christos DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
760 1.1 christos ZDICT_DICTSIZE_MIN);
761 1.1 christos return ERROR(dstSize_tooSmall);
762 1.1 christos }
763 1.1 christos /* Initialize context and activeDmers */
764 1.1 christos {
765 1.1 christos size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
766 1.1 christos parameters.d, parameters.splitPoint);
767 1.1 christos if (ZSTD_isError(initVal)) {
768 1.1 christos return initVal;
769 1.1 christos }
770 1.1 christos }
771 1.1 christos COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
772 1.1 christos if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
773 1.1 christos DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
774 1.1 christos COVER_ctx_destroy(&ctx);
775 1.1 christos return ERROR(memory_allocation);
776 1.1 christos }
777 1.1 christos
778 1.1 christos DISPLAYLEVEL(2, "Building dictionary\n");
779 1.1 christos {
780 1.1 christos const size_t tail =
781 1.1 christos COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
782 1.1 christos dictBufferCapacity, parameters);
783 1.1 christos const size_t dictionarySize = ZDICT_finalizeDictionary(
784 1.1 christos dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
785 1.1 christos samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
786 1.1 christos if (!ZSTD_isError(dictionarySize)) {
787 1.1 christos DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
788 1.1 christos (unsigned)dictionarySize);
789 1.1 christos }
790 1.1 christos COVER_ctx_destroy(&ctx);
791 1.1 christos COVER_map_destroy(&activeDmers);
792 1.1 christos return dictionarySize;
793 1.1 christos }
794 1.1 christos }
795 1.1 christos
796 1.1 christos
797 1.1 christos
798 1.1 christos size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
799 1.1 christos const size_t *samplesSizes, const BYTE *samples,
800 1.1 christos size_t *offsets,
801 1.1 christos size_t nbTrainSamples, size_t nbSamples,
802 1.1 christos BYTE *const dict, size_t dictBufferCapacity) {
803 1.1 christos size_t totalCompressedSize = ERROR(GENERIC);
804 1.1 christos /* Pointers */
805 1.1 christos ZSTD_CCtx *cctx;
806 1.1 christos ZSTD_CDict *cdict;
807 1.1 christos void *dst;
808 1.1 christos /* Local variables */
809 1.1 christos size_t dstCapacity;
810 1.1 christos size_t i;
811 1.1 christos /* Allocate dst with enough space to compress the maximum sized sample */
812 1.1 christos {
813 1.1 christos size_t maxSampleSize = 0;
814 1.1 christos i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
815 1.1 christos for (; i < nbSamples; ++i) {
816 1.1 christos maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
817 1.1 christos }
818 1.1 christos dstCapacity = ZSTD_compressBound(maxSampleSize);
819 1.1 christos dst = malloc(dstCapacity);
820 1.1 christos }
821 1.1 christos /* Create the cctx and cdict */
822 1.1 christos cctx = ZSTD_createCCtx();
823 1.1 christos cdict = ZSTD_createCDict(dict, dictBufferCapacity,
824 1.1 christos parameters.zParams.compressionLevel);
825 1.1 christos if (!dst || !cctx || !cdict) {
826 1.1 christos goto _compressCleanup;
827 1.1 christos }
828 1.1 christos /* Compress each sample and sum their sizes (or error) */
829 1.1 christos totalCompressedSize = dictBufferCapacity;
830 1.1 christos i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
831 1.1 christos for (; i < nbSamples; ++i) {
832 1.1 christos const size_t size = ZSTD_compress_usingCDict(
833 1.1 christos cctx, dst, dstCapacity, samples + offsets[i],
834 1.1 christos samplesSizes[i], cdict);
835 1.1 christos if (ZSTD_isError(size)) {
836 1.1 christos totalCompressedSize = size;
837 1.1 christos goto _compressCleanup;
838 1.1 christos }
839 1.1 christos totalCompressedSize += size;
840 1.1 christos }
841 1.1 christos _compressCleanup:
842 1.1 christos ZSTD_freeCCtx(cctx);
843 1.1 christos ZSTD_freeCDict(cdict);
844 1.1 christos if (dst) {
845 1.1 christos free(dst);
846 1.1 christos }
847 1.1 christos return totalCompressedSize;
848 1.1 christos }
849 1.1 christos
850 1.1 christos
851 1.1 christos /**
852 1.1 christos * Initialize the `COVER_best_t`.
853 1.1 christos */
854 1.1 christos void COVER_best_init(COVER_best_t *best) {
855 1.1 christos if (best==NULL) return; /* compatible with init on NULL */
856 1.1 christos (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
857 1.1 christos (void)ZSTD_pthread_cond_init(&best->cond, NULL);
858 1.1 christos best->liveJobs = 0;
859 1.1 christos best->dict = NULL;
860 1.1 christos best->dictSize = 0;
861 1.1 christos best->compressedSize = (size_t)-1;
862 1.1 christos memset(&best->parameters, 0, sizeof(best->parameters));
863 1.1 christos }
864 1.1 christos
865 1.1 christos /**
866 1.1 christos * Wait until liveJobs == 0.
867 1.1 christos */
868 1.1 christos void COVER_best_wait(COVER_best_t *best) {
869 1.1 christos if (!best) {
870 1.1 christos return;
871 1.1 christos }
872 1.1 christos ZSTD_pthread_mutex_lock(&best->mutex);
873 1.1 christos while (best->liveJobs != 0) {
874 1.1 christos ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
875 1.1 christos }
876 1.1 christos ZSTD_pthread_mutex_unlock(&best->mutex);
877 1.1 christos }
878 1.1 christos
879 1.1 christos /**
880 1.1 christos * Call COVER_best_wait() and then destroy the COVER_best_t.
881 1.1 christos */
882 1.1 christos void COVER_best_destroy(COVER_best_t *best) {
883 1.1 christos if (!best) {
884 1.1 christos return;
885 1.1 christos }
886 1.1 christos COVER_best_wait(best);
887 1.1 christos if (best->dict) {
888 1.1 christos free(best->dict);
889 1.1 christos }
890 1.1 christos ZSTD_pthread_mutex_destroy(&best->mutex);
891 1.1 christos ZSTD_pthread_cond_destroy(&best->cond);
892 1.1 christos }
893 1.1 christos
894 1.1 christos /**
895 1.1 christos * Called when a thread is about to be launched.
896 1.1 christos * Increments liveJobs.
897 1.1 christos */
898 1.1 christos void COVER_best_start(COVER_best_t *best) {
899 1.1 christos if (!best) {
900 1.1 christos return;
901 1.1 christos }
902 1.1 christos ZSTD_pthread_mutex_lock(&best->mutex);
903 1.1 christos ++best->liveJobs;
904 1.1 christos ZSTD_pthread_mutex_unlock(&best->mutex);
905 1.1 christos }
906 1.1 christos
907 1.1 christos /**
908 1.1 christos * Called when a thread finishes executing, both on error or success.
909 1.1 christos * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
910 1.1 christos * If this dictionary is the best so far save it and its parameters.
911 1.1 christos */
912 1.1 christos void COVER_best_finish(COVER_best_t* best,
913 1.1 christos ZDICT_cover_params_t parameters,
914 1.1 christos COVER_dictSelection_t selection)
915 1.1 christos {
916 1.1 christos void* dict = selection.dictContent;
917 1.1 christos size_t compressedSize = selection.totalCompressedSize;
918 1.1 christos size_t dictSize = selection.dictSize;
919 1.1 christos if (!best) {
920 1.1 christos return;
921 1.1 christos }
922 1.1 christos {
923 1.1 christos size_t liveJobs;
924 1.1 christos ZSTD_pthread_mutex_lock(&best->mutex);
925 1.1 christos --best->liveJobs;
926 1.1 christos liveJobs = best->liveJobs;
927 1.1 christos /* If the new dictionary is better */
928 1.1 christos if (compressedSize < best->compressedSize) {
929 1.1 christos /* Allocate space if necessary */
930 1.1 christos if (!best->dict || best->dictSize < dictSize) {
931 1.1 christos if (best->dict) {
932 1.1 christos free(best->dict);
933 1.1 christos }
934 1.1 christos best->dict = malloc(dictSize);
935 1.1 christos if (!best->dict) {
936 1.1 christos best->compressedSize = ERROR(GENERIC);
937 1.1 christos best->dictSize = 0;
938 1.1 christos ZSTD_pthread_cond_signal(&best->cond);
939 1.1 christos ZSTD_pthread_mutex_unlock(&best->mutex);
940 1.1 christos return;
941 1.1 christos }
942 1.1 christos }
943 1.1 christos /* Save the dictionary, parameters, and size */
944 1.1 christos if (dict) {
945 1.1 christos memcpy(best->dict, dict, dictSize);
946 1.1 christos best->dictSize = dictSize;
947 1.1 christos best->parameters = parameters;
948 1.1 christos best->compressedSize = compressedSize;
949 1.1 christos }
950 1.1 christos }
951 1.1 christos if (liveJobs == 0) {
952 1.1 christos ZSTD_pthread_cond_broadcast(&best->cond);
953 1.1 christos }
954 1.1 christos ZSTD_pthread_mutex_unlock(&best->mutex);
955 1.1 christos }
956 1.1 christos }
957 1.1 christos
958 1.1 christos static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz)
959 1.1 christos {
960 1.1 christos COVER_dictSelection_t ds;
961 1.1 christos ds.dictContent = buf;
962 1.1 christos ds.dictSize = s;
963 1.1 christos ds.totalCompressedSize = csz;
964 1.1 christos return ds;
965 1.1 christos }
966 1.1 christos
967 1.1 christos COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
968 1.1 christos return setDictSelection(NULL, 0, error);
969 1.1 christos }
970 1.1 christos
971 1.1 christos unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
972 1.1 christos return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
973 1.1 christos }
974 1.1 christos
975 1.1 christos void COVER_dictSelectionFree(COVER_dictSelection_t selection){
976 1.1 christos free(selection.dictContent);
977 1.1 christos }
978 1.1 christos
979 1.1 christos COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
980 1.1 christos size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
981 1.1 christos size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
982 1.1 christos
983 1.1 christos size_t largestDict = 0;
984 1.1 christos size_t largestCompressed = 0;
985 1.1 christos BYTE* customDictContentEnd = customDictContent + dictContentSize;
986 1.1 christos
987 1.1 christos BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);
988 1.1 christos BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);
989 1.1 christos double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
990 1.1 christos
991 1.1 christos if (!largestDictbuffer || !candidateDictBuffer) {
992 1.1 christos free(largestDictbuffer);
993 1.1 christos free(candidateDictBuffer);
994 1.1 christos return COVER_dictSelectionError(dictContentSize);
995 1.1 christos }
996 1.1 christos
997 1.1 christos /* Initial dictionary size and compressed size */
998 1.1 christos memcpy(largestDictbuffer, customDictContent, dictContentSize);
999 1.1 christos dictContentSize = ZDICT_finalizeDictionary(
1000 1.1 christos largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
1001 1.1 christos samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
1002 1.1 christos
1003 1.1 christos if (ZDICT_isError(dictContentSize)) {
1004 1.1 christos free(largestDictbuffer);
1005 1.1 christos free(candidateDictBuffer);
1006 1.1 christos return COVER_dictSelectionError(dictContentSize);
1007 1.1 christos }
1008 1.1 christos
1009 1.1 christos totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
1010 1.1 christos samplesBuffer, offsets,
1011 1.1 christos nbCheckSamples, nbSamples,
1012 1.1 christos largestDictbuffer, dictContentSize);
1013 1.1 christos
1014 1.1 christos if (ZSTD_isError(totalCompressedSize)) {
1015 1.1 christos free(largestDictbuffer);
1016 1.1 christos free(candidateDictBuffer);
1017 1.1 christos return COVER_dictSelectionError(totalCompressedSize);
1018 1.1 christos }
1019 1.1 christos
1020 1.1 christos if (params.shrinkDict == 0) {
1021 1.1 christos free(candidateDictBuffer);
1022 1.1 christos return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize);
1023 1.1 christos }
1024 1.1 christos
1025 1.1 christos largestDict = dictContentSize;
1026 1.1 christos largestCompressed = totalCompressedSize;
1027 1.1 christos dictContentSize = ZDICT_DICTSIZE_MIN;
1028 1.1 christos
1029 1.1 christos /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
1030 1.1 christos while (dictContentSize < largestDict) {
1031 1.1 christos memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
1032 1.1 christos dictContentSize = ZDICT_finalizeDictionary(
1033 1.1 christos candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
1034 1.1 christos samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
1035 1.1 christos
1036 1.1 christos if (ZDICT_isError(dictContentSize)) {
1037 1.1 christos free(largestDictbuffer);
1038 1.1 christos free(candidateDictBuffer);
1039 1.1 christos return COVER_dictSelectionError(dictContentSize);
1040 1.1 christos
1041 1.1 christos }
1042 1.1 christos
1043 1.1 christos totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
1044 1.1 christos samplesBuffer, offsets,
1045 1.1 christos nbCheckSamples, nbSamples,
1046 1.1 christos candidateDictBuffer, dictContentSize);
1047 1.1 christos
1048 1.1 christos if (ZSTD_isError(totalCompressedSize)) {
1049 1.1 christos free(largestDictbuffer);
1050 1.1 christos free(candidateDictBuffer);
1051 1.1 christos return COVER_dictSelectionError(totalCompressedSize);
1052 1.1 christos }
1053 1.1 christos
1054 1.1 christos if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) {
1055 1.1 christos free(largestDictbuffer);
1056 1.1 christos return setDictSelection( candidateDictBuffer, dictContentSize, totalCompressedSize );
1057 1.1 christos }
1058 1.1 christos dictContentSize *= 2;
1059 1.1 christos }
1060 1.1 christos dictContentSize = largestDict;
1061 1.1 christos totalCompressedSize = largestCompressed;
1062 1.1 christos free(candidateDictBuffer);
1063 1.1 christos return setDictSelection( largestDictbuffer, dictContentSize, totalCompressedSize );
1064 1.1 christos }
1065 1.1 christos
1066 1.1 christos /**
1067 1.1 christos * Parameters for COVER_tryParameters().
1068 1.1 christos */
1069 1.1 christos typedef struct COVER_tryParameters_data_s {
1070 1.1 christos const COVER_ctx_t *ctx;
1071 1.1 christos COVER_best_t *best;
1072 1.1 christos size_t dictBufferCapacity;
1073 1.1 christos ZDICT_cover_params_t parameters;
1074 1.1 christos } COVER_tryParameters_data_t;
1075 1.1 christos
1076 1.1 christos /**
1077 1.1 christos * Tries a set of parameters and updates the COVER_best_t with the results.
1078 1.1 christos * This function is thread safe if zstd is compiled with multithreaded support.
1079 1.1 christos * It takes its parameters as an *OWNING* opaque pointer to support threading.
1080 1.1 christos */
1081 1.1 christos static void COVER_tryParameters(void *opaque)
1082 1.1 christos {
1083 1.1 christos /* Save parameters as local variables */
1084 1.1 christos COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque;
1085 1.1 christos const COVER_ctx_t *const ctx = data->ctx;
1086 1.1 christos const ZDICT_cover_params_t parameters = data->parameters;
1087 1.1 christos size_t dictBufferCapacity = data->dictBufferCapacity;
1088 1.1 christos size_t totalCompressedSize = ERROR(GENERIC);
1089 1.1 christos /* Allocate space for hash table, dict, and freqs */
1090 1.1 christos COVER_map_t activeDmers;
1091 1.1 christos BYTE* const dict = (BYTE*)malloc(dictBufferCapacity);
1092 1.1 christos COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
1093 1.1 christos U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32));
1094 1.1 christos if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
1095 1.1 christos DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
1096 1.1 christos goto _cleanup;
1097 1.1 christos }
1098 1.1 christos if (!dict || !freqs) {
1099 1.1 christos DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
1100 1.1 christos goto _cleanup;
1101 1.1 christos }
1102 1.1 christos /* Copy the frequencies because we need to modify them */
1103 1.1 christos memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
1104 1.1 christos /* Build the dictionary */
1105 1.1 christos {
1106 1.1 christos const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
1107 1.1 christos dictBufferCapacity, parameters);
1108 1.1 christos selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
1109 1.1 christos ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
1110 1.1 christos totalCompressedSize);
1111 1.1 christos
1112 1.1 christos if (COVER_dictSelectionIsError(selection)) {
1113 1.1 christos DISPLAYLEVEL(1, "Failed to select dictionary\n");
1114 1.1 christos goto _cleanup;
1115 1.1 christos }
1116 1.1 christos }
1117 1.1 christos _cleanup:
1118 1.1 christos free(dict);
1119 1.1 christos COVER_best_finish(data->best, parameters, selection);
1120 1.1 christos free(data);
1121 1.1 christos COVER_map_destroy(&activeDmers);
1122 1.1 christos COVER_dictSelectionFree(selection);
1123 1.1 christos free(freqs);
1124 1.1 christos }
1125 1.1 christos
1126 1.1 christos ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
1127 1.1 christos void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,
1128 1.1 christos const size_t* samplesSizes, unsigned nbSamples,
1129 1.1 christos ZDICT_cover_params_t* parameters)
1130 1.1 christos {
1131 1.1 christos /* constants */
1132 1.1 christos const unsigned nbThreads = parameters->nbThreads;
1133 1.1 christos const double splitPoint =
1134 1.1 christos parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
1135 1.1 christos const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
1136 1.1 christos const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
1137 1.1 christos const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
1138 1.1 christos const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
1139 1.1 christos const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
1140 1.1 christos const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
1141 1.1 christos const unsigned kIterations =
1142 1.1 christos (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
1143 1.1 christos const unsigned shrinkDict = 0;
1144 1.1 christos /* Local variables */
1145 1.1 christos const int displayLevel = parameters->zParams.notificationLevel;
1146 1.1 christos unsigned iteration = 1;
1147 1.1 christos unsigned d;
1148 1.1 christos unsigned k;
1149 1.1 christos COVER_best_t best;
1150 1.1 christos POOL_ctx *pool = NULL;
1151 1.1 christos int warned = 0;
1152 1.1 christos
1153 1.1 christos /* Checks */
1154 1.1 christos if (splitPoint <= 0 || splitPoint > 1) {
1155 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
1156 1.1 christos return ERROR(parameter_outOfBound);
1157 1.1 christos }
1158 1.1 christos if (kMinK < kMaxD || kMaxK < kMinK) {
1159 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
1160 1.1 christos return ERROR(parameter_outOfBound);
1161 1.1 christos }
1162 1.1 christos if (nbSamples == 0) {
1163 1.1 christos DISPLAYLEVEL(1, "Cover must have at least one input file\n");
1164 1.1 christos return ERROR(srcSize_wrong);
1165 1.1 christos }
1166 1.1 christos if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
1167 1.1 christos DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
1168 1.1 christos ZDICT_DICTSIZE_MIN);
1169 1.1 christos return ERROR(dstSize_tooSmall);
1170 1.1 christos }
1171 1.1 christos if (nbThreads > 1) {
1172 1.1 christos pool = POOL_create(nbThreads, 1);
1173 1.1 christos if (!pool) {
1174 1.1 christos return ERROR(memory_allocation);
1175 1.1 christos }
1176 1.1 christos }
1177 1.1 christos /* Initialization */
1178 1.1 christos COVER_best_init(&best);
1179 1.1 christos /* Turn down global display level to clean up display at level 2 and below */
1180 1.1 christos g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
1181 1.1 christos /* Loop through d first because each new value needs a new context */
1182 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
1183 1.1 christos kIterations);
1184 1.1 christos for (d = kMinD; d <= kMaxD; d += 2) {
1185 1.1 christos /* Initialize the context for this value of d */
1186 1.1 christos COVER_ctx_t ctx;
1187 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
1188 1.1 christos {
1189 1.1 christos const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
1190 1.1 christos if (ZSTD_isError(initVal)) {
1191 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
1192 1.1 christos COVER_best_destroy(&best);
1193 1.1 christos POOL_free(pool);
1194 1.1 christos return initVal;
1195 1.1 christos }
1196 1.1 christos }
1197 1.1 christos if (!warned) {
1198 1.1 christos COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
1199 1.1 christos warned = 1;
1200 1.1 christos }
1201 1.1 christos /* Loop through k reusing the same context */
1202 1.1 christos for (k = kMinK; k <= kMaxK; k += kStepSize) {
1203 1.1 christos /* Prepare the arguments */
1204 1.1 christos COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
1205 1.1 christos sizeof(COVER_tryParameters_data_t));
1206 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
1207 1.1 christos if (!data) {
1208 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
1209 1.1 christos COVER_best_destroy(&best);
1210 1.1 christos COVER_ctx_destroy(&ctx);
1211 1.1 christos POOL_free(pool);
1212 1.1 christos return ERROR(memory_allocation);
1213 1.1 christos }
1214 1.1 christos data->ctx = &ctx;
1215 1.1 christos data->best = &best;
1216 1.1 christos data->dictBufferCapacity = dictBufferCapacity;
1217 1.1 christos data->parameters = *parameters;
1218 1.1 christos data->parameters.k = k;
1219 1.1 christos data->parameters.d = d;
1220 1.1 christos data->parameters.splitPoint = splitPoint;
1221 1.1 christos data->parameters.steps = kSteps;
1222 1.1 christos data->parameters.shrinkDict = shrinkDict;
1223 1.1 christos data->parameters.zParams.notificationLevel = g_displayLevel;
1224 1.1 christos /* Check the parameters */
1225 1.1 christos if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
1226 1.1 christos DISPLAYLEVEL(1, "Cover parameters incorrect\n");
1227 1.1 christos free(data);
1228 1.1 christos continue;
1229 1.1 christos }
1230 1.1 christos /* Call the function and pass ownership of data to it */
1231 1.1 christos COVER_best_start(&best);
1232 1.1 christos if (pool) {
1233 1.1 christos POOL_add(pool, &COVER_tryParameters, data);
1234 1.1 christos } else {
1235 1.1 christos COVER_tryParameters(data);
1236 1.1 christos }
1237 1.1 christos /* Print status */
1238 1.1 christos LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
1239 1.1 christos (unsigned)((iteration * 100) / kIterations));
1240 1.1 christos ++iteration;
1241 1.1 christos }
1242 1.1 christos COVER_best_wait(&best);
1243 1.1 christos COVER_ctx_destroy(&ctx);
1244 1.1 christos }
1245 1.1 christos LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
1246 1.1 christos /* Fill the output buffer and parameters with output of the best parameters */
1247 1.1 christos {
1248 1.1 christos const size_t dictSize = best.dictSize;
1249 1.1 christos if (ZSTD_isError(best.compressedSize)) {
1250 1.1 christos const size_t compressedSize = best.compressedSize;
1251 1.1 christos COVER_best_destroy(&best);
1252 1.1 christos POOL_free(pool);
1253 1.1 christos return compressedSize;
1254 1.1 christos }
1255 1.1 christos *parameters = best.parameters;
1256 1.1 christos memcpy(dictBuffer, best.dict, dictSize);
1257 1.1 christos COVER_best_destroy(&best);
1258 1.1 christos POOL_free(pool);
1259 1.1 christos return dictSize;
1260 1.1 christos }
1261 1.1 christos }
1262