1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 #include <limits.h> 12 #include <math.h> 13 #include <stddef.h> 14 #include <stdio.h> 15 #include <stdlib.h> 16 #include <string.h> 17 #include <time.h> /* time(), for seed random initialization */ 18 19 #include "util.h" 20 #include "timefn.h" /* UTIL_clockSpanMicro, SEC_TO_MICRO, UTIL_TIME_INITIALIZER */ 21 #include "zstd.h" 22 #include "zstd_internal.h" 23 #include "mem.h" 24 #define ZDICT_STATIC_LINKING_ONLY 25 #include "zdict.h" 26 27 /* Direct access to internal compression functions is required */ 28 #include "compress/zstd_compress.c" /* ZSTD_resetSeqStore, ZSTD_storeSeq, *_TO_OFFBASE, HIST_countFast_wksp, HIST_isError */ 29 #include "decompress/zstd_decompress_block.h" /* ZSTD_decompressBlock_deprecated */ 30 31 #define XXH_STATIC_LINKING_ONLY 32 #include "xxhash.h" /* XXH64 */ 33 34 #if !(defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)) 35 # define inline /* disable */ 36 #endif 37 38 /*-************************************ 39 * DISPLAY Macros 40 **************************************/ 41 #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) 42 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } 43 static U32 g_displayLevel = 2; 44 45 #define DISPLAYUPDATE(...) 
\ 46 do { \ 47 if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || \ 48 (g_displayLevel >= 4)) { \ 49 g_displayClock = UTIL_getTime(); \ 50 DISPLAY(__VA_ARGS__); \ 51 if (g_displayLevel >= 4) fflush(stderr); \ 52 } \ 53 } while (0) 54 55 static const U64 g_refreshRate = SEC_TO_MICRO / 6; 56 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; 57 58 #define CHECKERR(code) \ 59 do { \ 60 if (ZSTD_isError(code)) { \ 61 DISPLAY("Error occurred while generating data: %s\n", \ 62 ZSTD_getErrorName(code)); \ 63 exit(1); \ 64 } \ 65 } while (0) 66 67 68 /*-******************************************************* 69 * Random function 70 *********************************************************/ 71 static U32 RAND(U32* src) 72 { 73 #define RAND_rotl32(x,r) ((x << r) | (x >> (32 - r))) 74 static const U32 prime1 = 2654435761U; 75 static const U32 prime2 = 2246822519U; 76 U32 rand32 = *src; 77 rand32 *= prime1; 78 rand32 += prime2; 79 rand32 = RAND_rotl32(rand32, 13); 80 *src = rand32; 81 return RAND_rotl32(rand32, 27); 82 #undef RAND_rotl32 83 } 84 85 #define DISTSIZE (8192) 86 87 /* Write `size` bytes into `ptr`, all of which are less than or equal to `maxSymb` */ 88 static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb) 89 { 90 size_t i; 91 BYTE* op = ptr; 92 93 for (i = 0; i < size; i++) { 94 op[i] = (BYTE) (RAND(seed) % (maxSymb + 1)); 95 } 96 } 97 98 /* Write `size` random bytes into `ptr` */ 99 static void RAND_buffer(U32* seed, void* ptr, size_t size) 100 { 101 size_t i; 102 BYTE* op = ptr; 103 104 for (i = 0; i + 4 <= size; i += 4) { 105 MEM_writeLE32(op + i, RAND(seed)); 106 } 107 for (; i < size; i++) { 108 op[i] = RAND(seed) & 0xff; 109 } 110 } 111 112 /* Write `size` bytes into `ptr` following the distribution `dist` */ 113 static void RAND_bufferDist(U32* seed, BYTE* dist, void* ptr, size_t size) 114 { 115 size_t i; 116 BYTE* op = ptr; 117 118 for (i = 0; i < size; i++) { 119 op[i] = dist[RAND(seed) % DISTSIZE]; 120 } 121 } 
122 123 /* Generate a random distribution where the frequency of each symbol follows a 124 * geometric distribution defined by `weight` 125 * `dist` should have size at least `DISTSIZE` */ 126 static void RAND_genDist(U32* seed, BYTE* dist, double weight) 127 { 128 size_t i = 0; 129 size_t statesLeft = DISTSIZE; 130 BYTE symb = (BYTE) (RAND(seed) % 256); 131 BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */ 132 133 while (i < DISTSIZE) { 134 size_t states = ((size_t)(weight * (double)statesLeft)) + 1; 135 size_t j; 136 for (j = 0; j < states && i < DISTSIZE; j++, i++) { 137 dist[i] = symb; 138 } 139 140 symb += step; 141 statesLeft -= states; 142 } 143 } 144 145 /* Generates a random number in the range [min, max) */ 146 static inline U32 RAND_range(U32* seed, U32 min, U32 max) 147 { 148 return (RAND(seed) % (max-min)) + min; 149 } 150 151 #define ROUND(x) ((U32)(x + 0.5)) 152 153 /* Generates a random number in an exponential distribution with mean `mean` */ 154 static double RAND_exp(U32* seed, double mean) 155 { 156 double const u = RAND(seed) / (double) UINT_MAX; 157 return log(1-u) * (-mean); 158 } 159 160 /*-******************************************************* 161 * Constants and Structs 162 *********************************************************/ 163 const char* BLOCK_TYPES[] = {"raw", "rle", "compressed"}; 164 165 #define MAX_DECOMPRESSED_SIZE_LOG 20 166 #define MAX_DECOMPRESSED_SIZE (1ULL << MAX_DECOMPRESSED_SIZE_LOG) 167 168 #define MAX_WINDOW_LOG 22 /* Recommended support is 8MB, so limit to 4MB + mantissa */ 169 170 #define MIN_SEQ_LEN (3) 171 #define MAX_NB_SEQ ((ZSTD_BLOCKSIZE_MAX + MIN_SEQ_LEN - 1) / MIN_SEQ_LEN) 172 173 #ifndef MAX_PATH 174 #ifdef PATH_MAX 175 #define MAX_PATH PATH_MAX 176 #else 177 #define MAX_PATH 256 178 #endif 179 #endif 180 181 BYTE CONTENT_BUFFER[MAX_DECOMPRESSED_SIZE]; 182 BYTE FRAME_BUFFER[MAX_DECOMPRESSED_SIZE * 2]; 183 BYTE LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; 184 185 
SeqDef SEQUENCE_BUFFER[MAX_NB_SEQ]; 186 BYTE SEQUENCE_LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; /* storeSeq expects a place to copy literals to */ 187 BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX]; 188 BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX]; 189 BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX]; 190 191 U64 WKSP[HUF_WORKSPACE_SIZE_U64]; 192 193 typedef struct { 194 size_t contentSize; /* 0 means unknown (unless contentSize == windowSize == 0) */ 195 unsigned windowSize; /* contentSize >= windowSize means single segment */ 196 } frameHeader_t; 197 198 /* For repeat modes */ 199 typedef struct { 200 U32 rep[ZSTD_REP_NUM]; 201 202 int hufInit; 203 /* the distribution used in the previous block for repeat mode */ 204 BYTE hufDist[DISTSIZE]; 205 HUF_CElt hufTable [HUF_CTABLE_SIZE_ST(255)]; 206 207 int fseInit; 208 FSE_CTable offcodeCTable [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; 209 FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; 210 FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; 211 212 /* Symbols that were present in the previous distribution, for use with 213 * set_repeat */ 214 BYTE litlengthSymbolSet[36]; 215 BYTE offsetSymbolSet[29]; 216 BYTE matchlengthSymbolSet[53]; 217 } cblockStats_t; 218 219 typedef struct { 220 void* data; 221 void* dataStart; 222 void* dataEnd; 223 224 void* src; 225 void* srcStart; 226 void* srcEnd; 227 228 frameHeader_t header; 229 230 cblockStats_t stats; 231 cblockStats_t oldStats; /* so they can be rolled back if uncompressible */ 232 } frame_t; 233 234 typedef struct { 235 int useDict; 236 U32 dictID; 237 size_t dictContentSize; 238 BYTE* dictContent; 239 } dictInfo; 240 241 typedef enum { 242 gt_frame = 0, /* generate frames */ 243 gt_block, /* generate compressed blocks without block/frame headers */ 244 } genType_e; 245 246 #ifndef MIN 247 #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) 248 #endif 249 250 typedef enum { 251 lt_raw, 252 lt_rle, 253 lt_compressed, 254 } literalType_e; 255 256 /*-******************************************************* 257 * Global variables (set from command line) 258 *********************************************************/ 259 U32 g_maxDecompressedSizeLog = MAX_DECOMPRESSED_SIZE_LOG; /* <= 20 */ 260 U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX; /* <= 128 KB */ 261 262 /*-******************************************************* 263 * Generator Functions 264 *********************************************************/ 265 266 struct { 267 int contentSize; /* force the content size to be present */ 268 blockType_e *blockType; /* force specific block type */ 269 literalType_e *literalType; /* force specific literals type */ 270 int frame_header_only; /* generate only frame header */ 271 int no_magic; /* do not generate magic number */ 272 } opts; 273 274 /* Generate and write a random frame header */ 275 static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info) 276 { 277 BYTE* const op = frame->data; 278 size_t pos = 0; 279 frameHeader_t fh; 280 281 BYTE windowByte = 0; 282 283 int singleSegment = 0; 284 int contentSizeFlag = 0; 285 int fcsCode = 0; 286 287 memset(&fh, 0, sizeof(fh)); 288 289 /* generate window size */ 290 { 291 /* Follow window algorithm from specification */ 292 int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10); 293 int const mantissa = RAND(seed) % 8; 294 windowByte = (BYTE) ((exponent << 3) | mantissa); 295 fh.windowSize = (1U << (exponent + 10)); 296 fh.windowSize += fh.windowSize / 8 * mantissa; 297 } 298 299 { 300 /* Generate random content size */ 301 int force_block_type = opts.blockType != NULL; 302 size_t highBit; 303 if (RAND(seed) & 7 && g_maxDecompressedSizeLog > 7) { 304 /* do content of at least 128 bytes */ 305 highBit = 1ULL << RAND_range(seed, 7, g_maxDecompressedSizeLog); 306 } else if (force_block_type) { 307 if ((RAND(seed) & 3) || (*(opts.blockType) == 
bt_rle)) { 308 /* do small content */ 309 highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog)); 310 } else { 311 /* 0 size frame */ 312 highBit = 0; 313 } 314 } else if (RAND(seed) & 3) { 315 /* do small content */ 316 highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog)); 317 } else { 318 /* 0 size frame */ 319 highBit = 0; 320 } 321 fh.contentSize = highBit ? highBit + (RAND(seed) % highBit) : 0; 322 323 /* provide size sometimes */ 324 contentSizeFlag = opts.contentSize | (RAND(seed) & 1); 325 326 if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) { 327 /* do single segment sometimes */ 328 fh.windowSize = (U32) fh.contentSize; 329 singleSegment = 1; 330 } 331 } 332 333 if (contentSizeFlag) { 334 /* Determine how large fcs field has to be */ 335 int minFcsCode = (fh.contentSize >= 256) + 336 (fh.contentSize >= 65536 + 256) + 337 (fh.contentSize > 0xFFFFFFFFU); 338 if (!singleSegment && !minFcsCode) { 339 minFcsCode = 1; 340 } 341 fcsCode = minFcsCode + (RAND(seed) % (4 - minFcsCode)); 342 if (fcsCode == 1 && fh.contentSize < 256) fcsCode++; 343 } 344 345 /* write out the header */ 346 if (!opts.no_magic) { 347 MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER); 348 pos += 4; 349 } 350 351 { 352 /* 353 * fcsCode: 2-bit flag specifying how many bytes used to represent Frame_Content_Size (bits 7-6) 354 * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. (bit 5) 355 * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2) 356 * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 (bits 1-0) 357 * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header 358 */ 359 int const dictBits = info.useDict ? 
3 : 0; 360 BYTE const frameHeaderDescriptor = 361 (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits); 362 op[pos++] = frameHeaderDescriptor; 363 } 364 365 if (!singleSegment) { 366 op[pos++] = windowByte; 367 } 368 if (info.useDict) { 369 MEM_writeLE32(op + pos, (U32) info.dictID); 370 pos += 4; 371 } 372 if (contentSizeFlag) { 373 switch (fcsCode) { 374 default: /* Impossible */ 375 case 0: op[pos++] = (BYTE) fh.contentSize; break; 376 case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break; 377 case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break; 378 case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break; 379 } 380 } 381 382 DISPLAYLEVEL(3, " frame content size:\t%u\n", (unsigned)fh.contentSize); 383 DISPLAYLEVEL(3, " frame window size:\t%u\n", fh.windowSize); 384 DISPLAYLEVEL(3, " content size flag:\t%d\n", contentSizeFlag); 385 DISPLAYLEVEL(3, " single segment flag:\t%d\n", singleSegment); 386 387 frame->data = op + pos; 388 frame->header = fh; 389 } 390 391 /* Write a literal block in either raw or RLE form, return the literals size */ 392 static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize) 393 { 394 int force_literal_type = opts.literalType != NULL; 395 int const type = (force_literal_type) ? 
*(opts.literalType) : RAND(seed) % 2; 396 397 BYTE* op = (BYTE*)frame->data; 398 int const sizeFormatDesc = RAND(seed) % 8; 399 size_t litSize; 400 size_t maxLitSize = MIN(contentSize, g_maxBlockSize); 401 402 if (sizeFormatDesc == 0) { 403 /* Size_FormatDesc = ?0 */ 404 maxLitSize = MIN(maxLitSize, 31); 405 } else if (sizeFormatDesc <= 4) { 406 /* Size_FormatDesc = 01 */ 407 maxLitSize = MIN(maxLitSize, 4095); 408 } else { 409 /* Size_Format = 11 */ 410 maxLitSize = MIN(maxLitSize, 1048575); 411 } 412 413 litSize = RAND(seed) % (maxLitSize + 1); 414 if (frame->src == frame->srcStart && litSize == 0) { 415 litSize = 1; /* no empty literals if there's nothing preceding this block */ 416 } 417 if (litSize + 3 > contentSize) { 418 litSize = contentSize; /* no matches shorter than 3 are allowed */ 419 } 420 /* use smallest size format that fits */ 421 if (litSize < 32) { 422 op[0] = (type | (0 << 2) | (litSize << 3)) & 0xff; 423 op += 1; 424 } else if (litSize < 4096) { 425 op[0] = (type | (1 << 2) | (litSize << 4)) & 0xff; 426 op[1] = (litSize >> 4) & 0xff; 427 op += 2; 428 } else { 429 op[0] = (type | (3 << 2) | (litSize << 4)) & 0xff; 430 op[1] = (litSize >> 4) & 0xff; 431 op[2] = (litSize >> 12) & 0xff; 432 op += 3; 433 } 434 435 if (type == 0) { 436 /* Raw literals */ 437 DISPLAYLEVEL(4, " raw literals\n"); 438 439 RAND_buffer(seed, LITERAL_BUFFER, litSize); 440 memcpy(op, LITERAL_BUFFER, litSize); 441 op += litSize; 442 } else { 443 /* RLE literals */ 444 BYTE const symb = (BYTE) (RAND(seed) % 256); 445 446 DISPLAYLEVEL(4, " rle literals: 0x%02x\n", (unsigned)symb); 447 448 memset(LITERAL_BUFFER, symb, litSize); 449 op[0] = symb; 450 op++; 451 } 452 453 frame->data = op; 454 455 return litSize; 456 } 457 458 /* Generate a Huffman header for the given source */ 459 static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t dstSize, 460 const void* src, size_t srcSize) 461 { 462 BYTE* const ostart = (BYTE*)dst; 463 BYTE* op = ostart; 464 465 
unsigned huffLog = 11; 466 unsigned maxSymbolValue = 255; 467 468 unsigned count[HUF_SYMBOLVALUE_MAX+1]; 469 470 /* Scan input and build symbol stats */ 471 { size_t const largest = HIST_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, WKSP, sizeof(WKSP)); 472 assert(!HIST_isError(largest)); 473 if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 0; } /* single symbol, rle */ 474 if (largest <= (srcSize >> 7)+1) return 0; /* Fast heuristic : not compressible enough */ 475 } 476 477 /* Build Huffman Tree */ 478 /* Max Huffman log is 11, min is highbit(maxSymbolValue)+1 */ 479 huffLog = RAND_range(seed, ZSTD_highbit32(maxSymbolValue)+1, huffLog+1); 480 DISPLAYLEVEL(6, " huffman log: %u\n", huffLog); 481 { size_t const maxBits = HUF_buildCTable_wksp (hufTable, count, maxSymbolValue, huffLog, WKSP, sizeof(WKSP)); 482 CHECKERR(maxBits); 483 huffLog = (U32)maxBits; 484 } 485 486 /* Write table description header */ 487 { size_t const hSize = HUF_writeCTable_wksp (op, dstSize, hufTable, maxSymbolValue, huffLog, WKSP, sizeof(WKSP)); 488 if (hSize + 12 >= srcSize) return 0; /* not useful to try compression */ 489 op += hSize; 490 } 491 492 return op - ostart; 493 } 494 495 /* Write a Huffman coded literals block and return the literals size */ 496 static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize) 497 { 498 BYTE* origop = (BYTE*)frame->data; 499 BYTE* opend = (BYTE*)frame->dataEnd; 500 BYTE* op; 501 BYTE* const ostart = origop; 502 int const sizeFormat = RAND(seed) % 4; 503 size_t litSize; 504 size_t hufHeaderSize = 0; 505 size_t compressedSize = 0; 506 size_t maxLitSize = MIN(contentSize-3, g_maxBlockSize); 507 508 SymbolEncodingType_e hType; 509 510 if (contentSize < 64) { 511 /* make sure we get reasonably-sized literals for compression */ 512 return ERROR(GENERIC); 513 } 514 515 DISPLAYLEVEL(4, " compressed literals\n"); 516 517 switch (sizeFormat) { 518 case 0: /* fall through, size is the same as case 
1 */ 519 case 1: 520 maxLitSize = MIN(maxLitSize, 1023); 521 origop += 3; 522 break; 523 case 2: 524 maxLitSize = MIN(maxLitSize, 16383); 525 origop += 4; 526 break; 527 case 3: 528 maxLitSize = MIN(maxLitSize, 262143); 529 origop += 5; 530 break; 531 default:; /* impossible */ 532 } 533 534 do { 535 op = origop; 536 do { 537 litSize = RAND(seed) % (maxLitSize + 1); 538 } while (litSize < 32); /* avoid small literal sizes */ 539 if (litSize + 3 > contentSize) { 540 litSize = contentSize; /* no matches shorter than 3 are allowed */ 541 } 542 543 /* most of the time generate a new distribution */ 544 if ((RAND(seed) & 3) || !frame->stats.hufInit) { 545 do { 546 if (RAND(seed) & 3) { 547 /* add 10 to ensure some compressibility */ 548 double const weight = ((RAND(seed) % 90) + 10) / 100.0; 549 550 DISPLAYLEVEL(5, " distribution weight: %d%%\n", 551 (int)(weight * 100)); 552 553 RAND_genDist(seed, frame->stats.hufDist, weight); 554 } else { 555 /* sometimes do restricted range literals to force 556 * non-huffman headers */ 557 DISPLAYLEVEL(5, " small range literals\n"); 558 RAND_bufferMaxSymb(seed, frame->stats.hufDist, DISTSIZE, 559 15); 560 } 561 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER, 562 litSize); 563 564 /* generate the header from the distribution instead of the 565 * actual data to avoid bugs with symbols that were in the 566 * distribution but never showed up in the output */ 567 hufHeaderSize = writeHufHeader( 568 seed, frame->stats.hufTable, op, opend - op, 569 frame->stats.hufDist, DISTSIZE); 570 CHECKERR(hufHeaderSize); 571 /* repeat until a valid header is written */ 572 } while (hufHeaderSize == 0); 573 op += hufHeaderSize; 574 hType = set_compressed; 575 576 frame->stats.hufInit = 1; 577 } else { 578 /* repeat the distribution/table from last time */ 579 DISPLAYLEVEL(5, " huffman repeat stats\n"); 580 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER, 581 litSize); 582 hufHeaderSize = 0; 583 hType = set_repeat; 584 } 585 586 
do { 587 compressedSize = 588 sizeFormat == 0 589 ? HUF_compress1X_usingCTable( 590 op, opend - op, LITERAL_BUFFER, litSize, 591 frame->stats.hufTable, /* flags */ 0) 592 : HUF_compress4X_usingCTable( 593 op, opend - op, LITERAL_BUFFER, litSize, 594 frame->stats.hufTable, /* flags */ 0); 595 CHECKERR(compressedSize); 596 /* this only occurs when it could not compress or similar */ 597 } while (compressedSize <= 0); 598 599 op += compressedSize; 600 601 compressedSize += hufHeaderSize; 602 DISPLAYLEVEL(5, " regenerated size: %u\n", (unsigned)litSize); 603 DISPLAYLEVEL(5, " compressed size: %u\n", (unsigned)compressedSize); 604 if (compressedSize >= litSize) { 605 DISPLAYLEVEL(5, " trying again\n"); 606 /* if we have to try again, reset the stats so we don't accidentally 607 * try to repeat a distribution we just made */ 608 frame->stats = frame->oldStats; 609 } else { 610 break; 611 } 612 } while (1); 613 614 /* write header */ 615 switch (sizeFormat) { 616 case 0: /* fall through, size is the same as case 1 */ 617 case 1: { 618 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | 619 ((U32)compressedSize << 14); 620 MEM_writeLE24(ostart, header); 621 break; 622 } 623 case 2: { 624 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | 625 ((U32)compressedSize << 18); 626 MEM_writeLE32(ostart, header); 627 break; 628 } 629 case 3: { 630 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) | 631 ((U32)compressedSize << 22); 632 MEM_writeLE32(ostart, header); 633 ostart[4] = (BYTE)(compressedSize >> 10); 634 break; 635 } 636 default:; /* impossible */ 637 } 638 639 frame->data = op; 640 return litSize; 641 } 642 643 static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize) 644 { 645 int select_compressed = 0; 646 if (opts.literalType) { 647 select_compressed = *(opts.literalType) == lt_compressed; 648 } else { 649 /* only do compressed for larger segments to avoid compressibility issues */ 650 
select_compressed = RAND(seed) & 7 && contentSize >= 64; 651 } 652 653 if (select_compressed) { 654 return writeLiteralsBlockCompressed(seed, frame, contentSize); 655 } else { 656 return writeLiteralsBlockSimple(seed, frame, contentSize); 657 } 658 } 659 660 static inline void initSeqStore(SeqStore_t *seqStore) { 661 seqStore->maxNbSeq = MAX_NB_SEQ; 662 seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX; 663 seqStore->sequencesStart = SEQUENCE_BUFFER; 664 seqStore->litStart = SEQUENCE_LITERAL_BUFFER; 665 seqStore->llCode = SEQUENCE_LLCODE; 666 seqStore->mlCode = SEQUENCE_MLCODE; 667 seqStore->ofCode = SEQUENCE_OFCODE; 668 669 ZSTD_resetSeqStore(seqStore); 670 } 671 672 /* Randomly generate sequence commands */ 673 static U32 674 generateSequences(U32* seed, frame_t* frame, SeqStore_t* seqStore, 675 size_t contentSize, size_t literalsSize, dictInfo info) 676 { 677 /* The total length of all the matches */ 678 size_t const remainingMatch = contentSize - literalsSize; 679 size_t excessMatch = 0; 680 U32 numSequences = 0; 681 U32 i; 682 683 const BYTE* literals = LITERAL_BUFFER; 684 BYTE* srcPtr = frame->src; 685 686 if (literalsSize != contentSize) { 687 /* each match must be at least MIN_SEQ_LEN, so this is the maximum 688 * number of sequences we can have */ 689 U32 const maxSequences = (U32)remainingMatch / MIN_SEQ_LEN; 690 numSequences = (RAND(seed) % maxSequences) + 1; 691 692 /* the extra match lengths we have to allocate to each sequence */ 693 excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN; 694 } 695 696 DISPLAYLEVEL(5, " total match lengths: %u\n", (unsigned)remainingMatch); 697 for (i = 0; i < numSequences; i++) { 698 /* Generate match and literal lengths by exponential distribution to 699 * ensure nice numbers */ 700 U32 matchLen = 701 MIN_SEQ_LEN + 702 ROUND(RAND_exp(seed, (double)excessMatch / (double)(numSequences - i))); 703 U32 literalLen = 704 (RAND(seed) & 7) 705 ? 
ROUND(RAND_exp(seed, 706 (double)literalsSize / 707 (double)(numSequences - i))) 708 : 0; 709 /* actual offset, code to send, and point to copy up to when shifting 710 * codes in the repeat offsets history */ 711 U32 offset, offBase, repIndex; 712 713 /* bounds checks */ 714 matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN); 715 literalLen = MIN(literalLen, (U32) literalsSize); 716 if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1; 717 if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch; 718 719 memcpy(srcPtr, literals, literalLen); 720 srcPtr += literalLen; 721 do { 722 if (RAND(seed) & 7) { 723 /* do a normal offset */ 724 U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart); 725 offset = (RAND(seed) % 726 MIN(frame->header.windowSize, 727 (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) + 728 1; 729 if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) { 730 /* need to occasionally generate offsets that go past the start */ 731 /* including i+1 != numSequences because the last sequences has to adhere to predetermined contentSize */ 732 U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1; 733 offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart; 734 if (offset > frame->header.windowSize) { 735 if (lenPastStart < MIN_SEQ_LEN) { 736 /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */ 737 /* this also means that lenPastStart must be greater than MIN_SEQ_LEN */ 738 /* make sure lenPastStart does not go past dictionary start though */ 739 lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize); 740 offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart; 741 } 742 { U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart); 743 matchLen = MIN(matchLen, matchLenBound); 744 } 745 } 746 } 747 offBase = OFFSET_TO_OFFBASE(offset); 748 repIndex = 2; 749 
} else { 750 /* do a repeat offset */ 751 U32 const randomRepIndex = RAND(seed) % 3; 752 offBase = REPCODE_TO_OFFBASE(randomRepIndex + 1); /* expects values between 1 & 3 */ 753 if (literalLen > 0) { 754 offset = frame->stats.rep[randomRepIndex]; 755 repIndex = randomRepIndex; 756 } else { 757 /* special case : literalLen == 0 */ 758 offset = randomRepIndex == 2 ? frame->stats.rep[0] - 1 759 : frame->stats.rep[randomRepIndex + 1]; 760 repIndex = MIN(2, randomRepIndex + 1); 761 } 762 } 763 } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0); 764 765 { BYTE* const dictEnd = ZSTD_maybeNullPtrAdd(info.dictContent, info.dictContentSize); 766 size_t j; 767 for (j = 0; j < matchLen; j++) { 768 if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) { 769 /* copy from dictionary instead of literals */ 770 size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart); 771 *srcPtr = *(dictEnd - dictOffset); 772 } 773 else { 774 *srcPtr = *(srcPtr-offset); 775 } 776 srcPtr++; 777 } } 778 779 { int r; 780 for (r = repIndex; r > 0; r--) { 781 frame->stats.rep[r] = frame->stats.rep[r - 1]; 782 } 783 frame->stats.rep[0] = offset; 784 } 785 786 DISPLAYLEVEL(6, " LL: %5u OF: %5u ML: %5u", 787 (unsigned)literalLen, (unsigned)offset, (unsigned)matchLen); 788 DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u", 789 (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart), (unsigned)i); 790 DISPLAYLEVEL(6, "\n"); 791 if (OFFBASE_IS_REPCODE(offBase)) { /* expects sumtype numeric representation of ZSTD_storeSeq() */ 792 DISPLAYLEVEL(7, " repeat offset: %d\n", (int)repIndex); 793 } 794 /* use libzstd sequence handling */ 795 ZSTD_storeSeq(seqStore, literalLen, literals, literals + literalLen, 796 offBase, matchLen); 797 798 literalsSize -= literalLen; 799 excessMatch -= (matchLen - MIN_SEQ_LEN); 800 literals += literalLen; 801 } 802 803 memcpy(srcPtr, literals, literalsSize); 804 srcPtr += literalsSize; 805 DISPLAYLEVEL(6, " excess 
literals: %5u ", (unsigned)literalsSize); 806 DISPLAYLEVEL(7, "srcPos: %8u ", (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart)); 807 DISPLAYLEVEL(6, "\n"); 808 809 return numSequences; 810 } 811 812 static void initSymbolSet(const BYTE* symbols, size_t len, BYTE* set, BYTE maxSymbolValue) 813 { 814 size_t i; 815 816 memset(set, 0, (size_t)maxSymbolValue+1); 817 818 for (i = 0; i < len; i++) { 819 set[symbols[i]] = 1; 820 } 821 } 822 823 static int isSymbolSubset(const BYTE* symbols, size_t len, const BYTE* set, BYTE maxSymbolValue) 824 { 825 size_t i; 826 827 for (i = 0; i < len; i++) { 828 if (symbols[i] > maxSymbolValue || !set[symbols[i]]) { 829 return 0; 830 } 831 } 832 return 1; 833 } 834 835 static size_t writeSequences(U32* seed, frame_t* frame, SeqStore_t* seqStorePtr, 836 size_t nbSeq) 837 { 838 /* This code is mostly copied from ZSTD_compressSequences in zstd_compress.c */ 839 unsigned count[MaxSeq+1]; 840 S16 norm[MaxSeq+1]; 841 FSE_CTable* CTable_LitLength = frame->stats.litlengthCTable; 842 FSE_CTable* CTable_OffsetBits = frame->stats.offcodeCTable; 843 FSE_CTable* CTable_MatchLength = frame->stats.matchlengthCTable; 844 U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ 845 const SeqDef* const sequences = seqStorePtr->sequencesStart; 846 const BYTE* const ofCodeTable = seqStorePtr->ofCode; 847 const BYTE* const llCodeTable = seqStorePtr->llCode; 848 const BYTE* const mlCodeTable = seqStorePtr->mlCode; 849 BYTE* const oend = (BYTE*)frame->dataEnd; 850 BYTE* op = (BYTE*)frame->data; 851 BYTE* seqHead; 852 BYTE scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE(MaxSeq, MaxFSELog)]; 853 854 /* literals compressing block removed so that can be done separately */ 855 856 /* Sequences Header */ 857 if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall); 858 if (nbSeq < 128) *op++ = (BYTE)nbSeq; 859 else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; 860 else op[0]=0xFF, 
MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; 861 862 if (nbSeq==0) { 863 frame->data = op; 864 return 0; 865 } 866 867 /* seqHead : flags for FSE encoding type */ 868 seqHead = op++; 869 870 /* convert length/distances into codes */ 871 ZSTD_seqToCodes(seqStorePtr); 872 873 /* CTable for Literal Lengths */ 874 { unsigned max = MaxLL; 875 size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ 876 assert(!HIST_isError(mostFrequent)); 877 if (frame->stats.fseInit && !(RAND(seed) & 3) && 878 isSymbolSubset(llCodeTable, nbSeq, 879 frame->stats.litlengthSymbolSet, 35)) { 880 /* maybe do repeat mode if we're allowed to */ 881 LLtype = set_repeat; 882 } else if (mostFrequent == nbSeq) { 883 /* do RLE if we have the chance */ 884 *op++ = llCodeTable[0]; 885 FSE_buildCTable_rle(CTable_LitLength, (BYTE)max); 886 LLtype = set_rle; 887 } else if (!(RAND(seed) & 3)) { 888 /* maybe use the default distribution */ 889 CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer))); 890 LLtype = set_basic; 891 } else { 892 /* fall back on a full table */ 893 size_t nbSeq_1 = nbSeq; 894 const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); 895 if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; } 896 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); 897 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ 898 if (FSE_isError(NCountSize)) return ERROR(GENERIC); 899 op += NCountSize; } 900 CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer))); 901 LLtype = set_compressed; 902 } } 903 904 /* CTable for Offsets */ 905 /* see Literal Lengths for descriptions of mode choices */ 906 { unsigned max = MaxOff; 907 size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, 
WKSP, sizeof(WKSP)); /* cannot fail */ 908 assert(!HIST_isError(mostFrequent)); 909 if (frame->stats.fseInit && !(RAND(seed) & 3) && 910 isSymbolSubset(ofCodeTable, nbSeq, 911 frame->stats.offsetSymbolSet, 28)) { 912 Offtype = set_repeat; 913 } else if (mostFrequent == nbSeq) { 914 *op++ = ofCodeTable[0]; 915 FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); 916 Offtype = set_rle; 917 } else if (!(RAND(seed) & 3)) { 918 FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, DefaultMaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); 919 Offtype = set_basic; 920 } else { 921 size_t nbSeq_1 = nbSeq; 922 const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); 923 if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; } 924 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); 925 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ 926 if (FSE_isError(NCountSize)) return ERROR(GENERIC); 927 op += NCountSize; } 928 FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); 929 Offtype = set_compressed; 930 } } 931 932 /* CTable for MatchLengths */ 933 /* see Literal Lengths for descriptions of mode choices */ 934 { unsigned max = MaxML; 935 size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */ 936 assert(!HIST_isError(mostFrequent)); 937 if (frame->stats.fseInit && !(RAND(seed) & 3) && 938 isSymbolSubset(mlCodeTable, nbSeq, 939 frame->stats.matchlengthSymbolSet, 52)) { 940 MLtype = set_repeat; 941 } else if (mostFrequent == nbSeq) { 942 *op++ = *mlCodeTable; 943 FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max); 944 MLtype = set_rle; 945 } else if (!(RAND(seed) & 3)) { 946 /* sometimes do default distribution */ 947 FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)); 948 MLtype = 
set_basic; 949 } else { 950 /* fall back on table */ 951 size_t nbSeq_1 = nbSeq; 952 const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); 953 if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; } 954 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048); 955 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */ 956 if (FSE_isError(NCountSize)) return ERROR(GENERIC); 957 op += NCountSize; } 958 FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)); 959 MLtype = set_compressed; 960 } } 961 frame->stats.fseInit = 1; 962 initSymbolSet(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35); 963 initSymbolSet(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28); 964 initSymbolSet(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52); 965 966 DISPLAYLEVEL(5, " LL type: %d OF type: %d ML type: %d\n", (unsigned)LLtype, (unsigned)Offtype, (unsigned)MLtype); 967 968 *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); 969 970 /* Encoding Sequences */ 971 { BIT_CStream_t blockStream; 972 FSE_CState_t stateMatchLength; 973 FSE_CState_t stateOffsetBits; 974 FSE_CState_t stateLitLength; 975 976 RETURN_ERROR_IF( 977 ERR_isError(BIT_initCStream(&blockStream, op, oend-op)), 978 dstSize_tooSmall, "not enough space remaining"); 979 980 /* first symbols */ 981 FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); 982 FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); 983 FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); 984 BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); 985 if (MEM_32bits()) BIT_flushBits(&blockStream); 986 BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]); 987 if (MEM_32bits()) BIT_flushBits(&blockStream); 988 BIT_addBits(&blockStream, 
sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]); 989 BIT_flushBits(&blockStream); 990 991 { size_t n; 992 for (n=nbSeq-2 ; n<nbSeq ; n--) { /* intentional underflow */ 993 BYTE const llCode = llCodeTable[n]; 994 BYTE const ofCode = ofCodeTable[n]; 995 BYTE const mlCode = mlCodeTable[n]; 996 U32 const llBits = LL_bits[llCode]; 997 U32 const ofBits = ofCode; /* 32b*/ /* 64b*/ 998 U32 const mlBits = ML_bits[mlCode]; 999 /* (7)*/ /* (7)*/ 1000 FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */ 1001 FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */ 1002 if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/ 1003 FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */ 1004 if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog))) 1005 BIT_flushBits(&blockStream); /* (7)*/ 1006 BIT_addBits(&blockStream, sequences[n].litLength, llBits); 1007 if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); 1008 BIT_addBits(&blockStream, sequences[n].mlBase, mlBits); 1009 if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/ 1010 BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */ 1011 BIT_flushBits(&blockStream); /* (7)*/ 1012 } } 1013 1014 FSE_flushCState(&blockStream, &stateMatchLength); 1015 FSE_flushCState(&blockStream, &stateOffsetBits); 1016 FSE_flushCState(&blockStream, &stateLitLength); 1017 1018 { size_t const streamSize = BIT_closeCStream(&blockStream); 1019 if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */ 1020 op += streamSize; 1021 } } 1022 1023 frame->data = op; 1024 1025 return 0; 1026 } 1027 1028 static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize, 1029 size_t literalsSize, dictInfo info) 1030 { 1031 SeqStore_t seqStore; 1032 size_t numSequences; 1033 1034 1035 initSeqStore(&seqStore); 1036 1037 /* randomly generate sequences */ 1038 numSequences = generateSequences(seed, frame, &seqStore, 
contentSize, literalsSize, info); 1039 /* write them out to the frame data */ 1040 CHECKERR(writeSequences(seed, frame, &seqStore, numSequences)); 1041 1042 return numSequences; 1043 } 1044 1045 static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info) 1046 { 1047 BYTE* const blockStart = (BYTE*)frame->data; 1048 size_t literalsSize; 1049 size_t nbSeq; 1050 1051 DISPLAYLEVEL(4, " compressed block:\n"); 1052 1053 literalsSize = writeLiteralsBlock(seed, frame, contentSize); 1054 1055 DISPLAYLEVEL(4, " literals size: %u\n", (unsigned)literalsSize); 1056 1057 nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize, info); 1058 1059 DISPLAYLEVEL(4, " number of sequences: %u\n", (unsigned)nbSeq); 1060 1061 return (BYTE*)frame->data - blockStart; 1062 } 1063 1064 static void writeBlock(U32* seed, frame_t* frame, size_t contentSize, 1065 int lastBlock, dictInfo info) 1066 { 1067 int force_block_type = opts.blockType != NULL; 1068 int const blockTypeDesc = (force_block_type) ? *(opts.blockType) : RAND(seed) % 8; 1069 size_t blockSize; 1070 int blockType; 1071 1072 BYTE *const header = (BYTE*)frame->data; 1073 BYTE *op = header + 3; 1074 1075 DISPLAYLEVEL(4, " block:\n"); 1076 DISPLAYLEVEL(4, " block content size: %u\n", (unsigned)contentSize); 1077 DISPLAYLEVEL(4, " last block: %s\n", lastBlock ? 
"yes" : "no"); 1078 1079 if (blockTypeDesc == 0) { 1080 /* Raw data frame */ 1081 1082 RAND_buffer(seed, frame->src, contentSize); 1083 memcpy(op, frame->src, contentSize); 1084 1085 op += contentSize; 1086 blockType = 0; 1087 blockSize = contentSize; 1088 } else if (blockTypeDesc == 1 && frame->header.contentSize > 0) { 1089 /* RLE (Don't create RLE block if frame content is 0 since block size of 1 may exceed max block size)*/ 1090 BYTE const symbol = RAND(seed) & 0xff; 1091 1092 op[0] = symbol; 1093 memset(frame->src, symbol, contentSize); 1094 1095 op++; 1096 blockType = 1; 1097 blockSize = contentSize; 1098 } else { 1099 /* compressed, most common */ 1100 size_t compressedSize; 1101 blockType = 2; 1102 1103 frame->oldStats = frame->stats; 1104 1105 frame->data = op; 1106 compressedSize = writeCompressedBlock(seed, frame, contentSize, info); 1107 if (compressedSize >= contentSize && !force_block_type) { /* compressed block must be strictly smaller than uncompressed one */ 1108 blockType = 0; 1109 memcpy(op, frame->src, contentSize); 1110 1111 op += contentSize; 1112 blockSize = contentSize; /* fall back on raw block if data doesn't 1113 compress */ 1114 1115 frame->stats = frame->oldStats; /* don't update the stats */ 1116 } else { 1117 op += compressedSize; 1118 blockSize = compressedSize; 1119 } 1120 } 1121 frame->src = (BYTE*)frame->src + contentSize; 1122 1123 DISPLAYLEVEL(4, " block type: %s\n", BLOCK_TYPES[blockType]); 1124 DISPLAYLEVEL(4, " block size field: %u\n", (unsigned)blockSize); 1125 1126 header[0] = (BYTE) ((lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff); 1127 MEM_writeLE16(header + 1, (U16) (blockSize >> 5)); 1128 1129 frame->data = op; 1130 } 1131 1132 static void writeBlocks(U32* seed, frame_t* frame, dictInfo info) 1133 { 1134 size_t contentLeft = frame->header.contentSize; 1135 size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize); 1136 while (1) { 1137 /* 1 in 4 chance of ending frame */ 1138 int const 
lastBlock = contentLeft > maxBlockSize ? 0 : !(RAND(seed) & 3); 1139 size_t blockContentSize; 1140 if (lastBlock) { 1141 blockContentSize = contentLeft; 1142 } else { 1143 if (contentLeft > 0 && (RAND(seed) & 7)) { 1144 /* some variable size block */ 1145 blockContentSize = RAND(seed) % (MIN(maxBlockSize, contentLeft)+1); 1146 } else if (contentLeft > maxBlockSize && (RAND(seed) & 1)) { 1147 /* some full size block */ 1148 blockContentSize = maxBlockSize; 1149 } else { 1150 /* some empty block */ 1151 blockContentSize = 0; 1152 } 1153 } 1154 1155 writeBlock(seed, frame, blockContentSize, lastBlock, info); 1156 1157 contentLeft -= blockContentSize; 1158 if (lastBlock) break; 1159 } 1160 } 1161 1162 static void writeChecksum(frame_t* frame) 1163 { 1164 /* write checksum so implementations can verify their output */ 1165 U64 digest = XXH64(frame->srcStart, (BYTE*)frame->src-(BYTE*)frame->srcStart, 0); 1166 DISPLAYLEVEL(3, " checksum: %08x\n", (unsigned)digest); 1167 MEM_writeLE32(frame->data, (U32)digest); 1168 frame->data = (BYTE*)frame->data + 4; 1169 } 1170 1171 static void outputBuffer(const void* buf, size_t size, const char* const path) 1172 { 1173 /* write data out to file */ 1174 const BYTE* ip = (const BYTE*)buf; 1175 FILE* out; 1176 if (path) { 1177 out = fopen(path, "wb"); 1178 } else { 1179 out = stdout; 1180 } 1181 if (!out) { 1182 fprintf(stderr, "Failed to open file at %s: ", path); 1183 perror(NULL); 1184 exit(1); 1185 } 1186 1187 { size_t fsize = size; 1188 size_t written = 0; 1189 while (written < fsize) { 1190 written += fwrite(ip + written, 1, fsize - written, out); 1191 if (ferror(out)) { 1192 fprintf(stderr, "Failed to write to file at %s: ", path); 1193 perror(NULL); 1194 exit(1); 1195 } 1196 } 1197 } 1198 1199 if (path) { 1200 fclose(out); 1201 } 1202 } 1203 1204 static void initFrame(frame_t* fr) 1205 { 1206 memset(fr, 0, sizeof(*fr)); 1207 fr->data = fr->dataStart = FRAME_BUFFER; 1208 fr->dataEnd = FRAME_BUFFER + sizeof(FRAME_BUFFER); 1209 
fr->src = fr->srcStart = CONTENT_BUFFER; 1210 fr->srcEnd = CONTENT_BUFFER + sizeof(CONTENT_BUFFER); 1211 1212 /* init repeat codes */ 1213 fr->stats.rep[0] = 1; 1214 fr->stats.rep[1] = 4; 1215 fr->stats.rep[2] = 8; 1216 } 1217 1218 /** 1219 * Generated a single zstd compressed block with no block/frame header. 1220 * Returns the final seed. 1221 */ 1222 static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info) 1223 { 1224 size_t blockContentSize; 1225 int blockWritten = 0; 1226 BYTE* op; 1227 DISPLAYLEVEL(4, "block seed: %u\n", (unsigned)seed); 1228 initFrame(frame); 1229 op = (BYTE*)frame->data; 1230 1231 while (!blockWritten) { 1232 size_t cSize; 1233 /* generate window size */ 1234 { int const exponent = RAND(&seed) % (MAX_WINDOW_LOG - 10); 1235 int const mantissa = RAND(&seed) % 8; 1236 frame->header.windowSize = (1U << (exponent + 10)); 1237 frame->header.windowSize += (frame->header.windowSize / 8) * mantissa; 1238 } 1239 1240 /* generate content size */ 1241 { size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize); 1242 if (RAND(&seed) & 15) { 1243 /* some full size blocks */ 1244 blockContentSize = maxBlockSize; 1245 } else if (RAND(&seed) & 7 && g_maxBlockSize >= (1U << 7)) { 1246 /* some small blocks <= 128 bytes*/ 1247 blockContentSize = RAND(&seed) % (1U << 7); 1248 } else { 1249 /* some variable size blocks */ 1250 blockContentSize = RAND(&seed) % maxBlockSize; 1251 } 1252 } 1253 1254 /* try generating a compressed block */ 1255 frame->oldStats = frame->stats; 1256 frame->data = op; 1257 cSize = writeCompressedBlock(&seed, frame, blockContentSize, info); 1258 if (cSize >= blockContentSize) { /* compressed size must be strictly smaller than decompressed size : https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks */ 1259 /* data doesn't compress -- try again */ 1260 frame->stats = frame->oldStats; /* don't update the stats */ 1261 DISPLAYLEVEL(5, " can't compress block : try again \n"); 1262 
} else { 1263 blockWritten = 1; 1264 DISPLAYLEVEL(4, " block size: %u \n", (unsigned)cSize); 1265 frame->src = (BYTE*)frame->src + blockContentSize; 1266 } 1267 } 1268 return seed; 1269 } 1270 1271 /* Return the final seed */ 1272 static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info) 1273 { 1274 /* generate a complete frame */ 1275 DISPLAYLEVEL(3, "frame seed: %u\n", (unsigned)seed); 1276 initFrame(fr); 1277 1278 1279 writeFrameHeader(&seed, fr, info); 1280 if (opts.frame_header_only) 1281 return seed; 1282 1283 writeBlocks(&seed, fr, info); 1284 writeChecksum(fr); 1285 1286 return seed; 1287 } 1288 1289 /*_******************************************************* 1290 * Dictionary Helper Functions 1291 *********************************************************/ 1292 /* returns 0 if successful, otherwise returns 1 upon error */ 1293 static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict) 1294 { 1295 /* allocate space for samples */ 1296 int ret = 0; 1297 unsigned const numSamples = 4; 1298 size_t sampleSizes[4]; 1299 BYTE* const samples = malloc(5000*sizeof(BYTE)); 1300 if (samples == NULL) { 1301 DISPLAY("Error: could not allocate space for samples\n"); 1302 return 1; 1303 } 1304 1305 /* generate samples */ 1306 { unsigned literalValue = 1; 1307 unsigned samplesPos = 0; 1308 size_t currSize = 1; 1309 while (literalValue <= 4) { 1310 sampleSizes[literalValue - 1] = currSize; 1311 { size_t k; 1312 for (k = 0; k < currSize; k++) { 1313 *(samples + (samplesPos++)) = (BYTE)literalValue; 1314 } } 1315 literalValue++; 1316 currSize *= 16; 1317 } } 1318 1319 { size_t dictWriteSize = 0; 1320 ZDICT_params_t zdictParams; 1321 size_t const headerSize = MAX(dictSize/4, 256); 1322 size_t const dictContentSize = dictSize - headerSize; 1323 BYTE* const dictContent = fullDict + headerSize; 1324 if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) { 1325 DISPLAY("Error: dictionary size is too small\n"); 1326 ret = 1; 1327 goto 
exitGenRandomDict; 1328 } 1329 1330 /* init dictionary params */ 1331 memset(&zdictParams, 0, sizeof(zdictParams)); 1332 zdictParams.dictID = dictID; 1333 zdictParams.notificationLevel = 1; 1334 1335 /* fill in dictionary content */ 1336 RAND_buffer(&seed, (void*)dictContent, dictContentSize); 1337 1338 /* finalize dictionary with random samples */ 1339 dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize, 1340 dictContent, dictContentSize, 1341 samples, sampleSizes, numSamples, 1342 zdictParams); 1343 1344 if (ZDICT_isError(dictWriteSize)) { 1345 DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize)); 1346 ret = 1; 1347 } 1348 } 1349 1350 exitGenRandomDict: 1351 free(samples); 1352 return ret; 1353 } 1354 1355 static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){ 1356 /* allocate space statically */ 1357 dictInfo dictOp; 1358 memset(&dictOp, 0, sizeof(dictOp)); 1359 dictOp.useDict = useDict; 1360 dictOp.dictContentSize = dictContentSize; 1361 dictOp.dictContent = dictContent; 1362 dictOp.dictID = dictID; 1363 return dictOp; 1364 } 1365 1366 /*-******************************************************* 1367 * Test Mode 1368 *********************************************************/ 1369 1370 BYTE DECOMPRESSED_BUFFER[MAX_DECOMPRESSED_SIZE]; 1371 1372 static size_t testDecodeSimple(frame_t* fr) 1373 { 1374 /* test decoding the generated data with the simple API */ 1375 size_t const ret = ZSTD_decompress(DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1376 fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart); 1377 1378 if (ZSTD_isError(ret)) return ret; 1379 1380 if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart, 1381 (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) { 1382 return ERROR(corruption_detected); 1383 } 1384 1385 return ret; 1386 } 1387 1388 static size_t testDecodeStreaming(frame_t* fr) 1389 { 1390 /* test decoding the generated data with the streaming API */ 1391 ZSTD_DStream* zd = 
ZSTD_createDStream(); 1392 ZSTD_inBuffer in; 1393 ZSTD_outBuffer out; 1394 size_t ret; 1395 1396 if (!zd) return ERROR(memory_allocation); 1397 1398 in.src = fr->dataStart; 1399 in.pos = 0; 1400 in.size = (BYTE*)fr->data - (BYTE*)fr->dataStart; 1401 1402 out.dst = DECOMPRESSED_BUFFER; 1403 out.pos = 0; 1404 out.size = ZSTD_DStreamOutSize(); 1405 1406 ZSTD_initDStream(zd); 1407 while (1) { 1408 ret = ZSTD_decompressStream(zd, &out, &in); 1409 if (ZSTD_isError(ret)) goto cleanup; /* error */ 1410 if (ret == 0) break; /* frame is done */ 1411 1412 /* force decoding to be done in chunks */ 1413 out.size += MIN(ZSTD_DStreamOutSize(), MAX_DECOMPRESSED_SIZE - out.size); 1414 } 1415 1416 ret = out.pos; 1417 1418 if (memcmp(out.dst, fr->srcStart, out.pos) != 0) { 1419 return ERROR(corruption_detected); 1420 } 1421 1422 cleanup: 1423 ZSTD_freeDStream(zd); 1424 return ret; 1425 } 1426 1427 static size_t testDecodeWithDict(U32 seed, genType_e genType) 1428 { 1429 /* create variables */ 1430 size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN; 1431 U32 const dictID = RAND(&seed); 1432 size_t errorDetected = 0; 1433 BYTE* const fullDict = malloc(dictSize); 1434 if (fullDict == NULL) { 1435 return ERROR(GENERIC); 1436 } 1437 1438 /* generate random dictionary */ 1439 if (genRandomDict(dictID, seed, dictSize, fullDict)) { /* return 0 on success */ 1440 errorDetected = ERROR(GENERIC); 1441 goto dictTestCleanup; 1442 } 1443 1444 1445 { frame_t fr; 1446 dictInfo info; 1447 ZSTD_DCtx* const dctx = ZSTD_createDCtx(); 1448 size_t ret; 1449 1450 /* get dict info */ 1451 { size_t const headerSize = MAX(dictSize/4, 256); 1452 size_t const dictContentSize = dictSize-headerSize; 1453 BYTE* const dictContent = fullDict+headerSize; 1454 info = initDictInfo(1, dictContentSize, dictContent, dictID); 1455 } 1456 1457 /* manually decompress and check difference */ 1458 if (genType == gt_frame) { 1459 /* Test frame */ 1460 generateFrame(seed, &fr, info); 
1461 ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1462 fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, 1463 fullDict, dictSize); 1464 } else { 1465 /* Test block */ 1466 generateCompressedBlock(seed, &fr, info); 1467 ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize); 1468 if (ZSTD_isError(ret)) { 1469 errorDetected = ret; 1470 ZSTD_freeDCtx(dctx); 1471 goto dictTestCleanup; 1472 } 1473 ret = ZSTD_decompressBlock_deprecated(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1474 fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart); 1475 } 1476 ZSTD_freeDCtx(dctx); 1477 1478 if (ZSTD_isError(ret)) { 1479 errorDetected = ret; 1480 goto dictTestCleanup; 1481 } 1482 1483 if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) { 1484 errorDetected = ERROR(corruption_detected); 1485 goto dictTestCleanup; 1486 } 1487 } 1488 1489 dictTestCleanup: 1490 free(fullDict); 1491 return errorDetected; 1492 } 1493 1494 static size_t testDecodeRawBlock(frame_t* fr) 1495 { 1496 ZSTD_DCtx* dctx = ZSTD_createDCtx(); 1497 size_t ret = ZSTD_decompressBegin(dctx); 1498 if (ZSTD_isError(ret)) return ret; 1499 1500 ret = ZSTD_decompressBlock_deprecated( 1501 dctx, 1502 DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE, 1503 fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart); 1504 ZSTD_freeDCtx(dctx); 1505 if (ZSTD_isError(ret)) return ret; 1506 1507 if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart, 1508 (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) { 1509 return ERROR(corruption_detected); 1510 } 1511 1512 return ret; 1513 } 1514 1515 static int runBlockTest(U32* seed) 1516 { 1517 frame_t fr; 1518 U32 const seedCopy = *seed; 1519 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1520 *seed = generateCompressedBlock(*seed, &fr, info); 1521 } 1522 1523 { size_t const r = testDecodeRawBlock(&fr); 1524 if (ZSTD_isError(r)) { 1525 DISPLAY("Error in block mode on test seed %u: %s\n", 1526 (unsigned)seedCopy, 
ZSTD_getErrorName(r)); 1527 return 1; 1528 } 1529 } 1530 1531 { size_t const r = testDecodeWithDict(*seed, gt_block); 1532 if (ZSTD_isError(r)) { 1533 DISPLAY("Error in block mode with dictionary on test seed %u: %s\n", 1534 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1535 return 1; 1536 } 1537 } 1538 return 0; 1539 } 1540 1541 static int runFrameTest(U32* seed) 1542 { 1543 frame_t fr; 1544 U32 const seedCopy = *seed; 1545 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1546 *seed = generateFrame(*seed, &fr, info); 1547 } 1548 1549 { size_t const r = testDecodeSimple(&fr); 1550 if (ZSTD_isError(r)) { 1551 DISPLAY("Error in simple mode on test seed %u: %s\n", 1552 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1553 return 1; 1554 } 1555 } 1556 { size_t const r = testDecodeStreaming(&fr); 1557 if (ZSTD_isError(r)) { 1558 DISPLAY("Error in streaming mode on test seed %u: %s\n", 1559 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1560 return 1; 1561 } 1562 } 1563 { size_t const r = testDecodeWithDict(*seed, gt_frame); /* avoid big dictionaries */ 1564 if (ZSTD_isError(r)) { 1565 DISPLAY("Error in dictionary mode on test seed %u: %s\n", 1566 (unsigned)seedCopy, ZSTD_getErrorName(r)); 1567 return 1; 1568 } 1569 } 1570 return 0; 1571 } 1572 1573 static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS, 1574 genType_e genType) 1575 { 1576 unsigned fnum; 1577 1578 UTIL_time_t const startClock = UTIL_getTime(); 1579 U64 const maxClockSpan = testDurationS * SEC_TO_MICRO; 1580 1581 if (numFiles == 0 && !testDurationS) numFiles = 1; 1582 1583 DISPLAY("seed: %u\n", (unsigned)seed); 1584 1585 for (fnum = 0; fnum < numFiles || UTIL_clockSpanMicro(startClock) < maxClockSpan; fnum++) { 1586 if (fnum < numFiles) 1587 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); 1588 else 1589 DISPLAYUPDATE("\r%u ", fnum); 1590 1591 { int const ret = (genType == gt_frame) ? 
1592 runFrameTest(&seed) : 1593 runBlockTest(&seed); 1594 if (ret) return ret; 1595 } 1596 } 1597 1598 DISPLAY("\r%u tests completed: ", fnum); 1599 DISPLAY("OK\n"); 1600 1601 return 0; 1602 } 1603 1604 /*-******************************************************* 1605 * File I/O 1606 *********************************************************/ 1607 1608 static int generateFile(U32 seed, const char* const path, 1609 const char* const origPath, genType_e genType) 1610 { 1611 frame_t fr; 1612 1613 DISPLAY("seed: %u\n", (unsigned)seed); 1614 1615 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1616 if (genType == gt_frame) { 1617 generateFrame(seed, &fr, info); 1618 } else { 1619 generateCompressedBlock(seed, &fr, info); 1620 } 1621 } 1622 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path); 1623 if (origPath) { 1624 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath); 1625 } 1626 return 0; 1627 } 1628 1629 static int generateCorpus(U32 seed, unsigned numFiles, const char* const path, 1630 const char* const origPath, genType_e genType) 1631 { 1632 char outPath[MAX_PATH]; 1633 unsigned fnum; 1634 1635 DISPLAY("seed: %u\n", (unsigned)seed); 1636 1637 for (fnum = 0; fnum < numFiles; fnum++) { 1638 frame_t fr; 1639 1640 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); 1641 1642 { dictInfo const info = initDictInfo(0, 0, NULL, 0); 1643 if (genType == gt_frame) { 1644 seed = generateFrame(seed, &fr, info); 1645 } else { 1646 seed = generateCompressedBlock(seed, &fr, info); 1647 } 1648 } 1649 1650 if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) { 1651 DISPLAY("Error: path too long\n"); 1652 return 1; 1653 } 1654 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath); 1655 1656 if (origPath) { 1657 if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) { 1658 DISPLAY("Error: path too long\n"); 1659 return 1; 1660 } 1661 outputBuffer(fr.srcStart, (BYTE*)fr.src - 
(BYTE*)fr.srcStart, outPath); 1662 } 1663 } 1664 1665 DISPLAY("\r%u/%u \n", fnum, numFiles); 1666 1667 return 0; 1668 } 1669 1670 static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path, 1671 const char* const origPath, const size_t dictSize, 1672 genType_e genType) 1673 { 1674 char outPath[MAX_PATH]; 1675 BYTE* fullDict; 1676 U32 const dictID = RAND(&seed); 1677 int errorDetected = 0; 1678 1679 if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) { 1680 DISPLAY("Error: path too long\n"); 1681 return 1; 1682 } 1683 1684 /* allocate space for the dictionary */ 1685 fullDict = malloc(dictSize); 1686 if (fullDict == NULL) { 1687 DISPLAY("Error: could not allocate space for full dictionary.\n"); 1688 return 1; 1689 } 1690 1691 /* randomly generate the dictionary */ 1692 { int const ret = genRandomDict(dictID, seed, dictSize, fullDict); 1693 if (ret != 0) { 1694 errorDetected = ret; 1695 goto dictCleanup; 1696 } 1697 } 1698 1699 /* write out dictionary */ 1700 if (numFiles != 0) { 1701 if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) { 1702 DISPLAY("Error: dictionary path too long\n"); 1703 errorDetected = 1; 1704 goto dictCleanup; 1705 } 1706 outputBuffer(fullDict, dictSize, outPath); 1707 } 1708 else { 1709 outputBuffer(fullDict, dictSize, "dictionary"); 1710 } 1711 1712 /* generate random compressed/decompressed files */ 1713 { unsigned fnum; 1714 for (fnum = 0; fnum < MAX(numFiles, 1); fnum++) { 1715 frame_t fr; 1716 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles); 1717 { 1718 size_t const headerSize = MAX(dictSize/4, 256); 1719 size_t const dictContentSize = dictSize-headerSize; 1720 BYTE* const dictContent = fullDict+headerSize; 1721 dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID); 1722 if (genType == gt_frame) { 1723 seed = generateFrame(seed, &fr, info); 1724 } else { 1725 seed = generateCompressedBlock(seed, &fr, info); 1726 } 1727 } 1728 1729 if (numFiles != 0) { 1730 
if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) { 1731 DISPLAY("Error: path too long\n"); 1732 errorDetected = 1; 1733 goto dictCleanup; 1734 } 1735 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath); 1736 1737 if (origPath) { 1738 if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) { 1739 DISPLAY("Error: path too long\n"); 1740 errorDetected = 1; 1741 goto dictCleanup; 1742 } 1743 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath); 1744 } 1745 } 1746 else { 1747 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path); 1748 if (origPath) { 1749 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath); 1750 } 1751 } 1752 } 1753 } 1754 1755 dictCleanup: 1756 free(fullDict); 1757 return errorDetected; 1758 } 1759 1760 1761 /*_******************************************************* 1762 * Command line 1763 *********************************************************/ 1764 static U32 makeSeed(void) 1765 { 1766 U32 t = (U32) time(NULL); 1767 return XXH32(&t, sizeof(t), 0) % 65536; 1768 } 1769 1770 static unsigned readInt(const char** argument) 1771 { 1772 unsigned val = 0; 1773 while ((**argument>='0') && (**argument<='9')) { 1774 val *= 10; 1775 val += **argument - '0'; 1776 (*argument)++; 1777 } 1778 return val; 1779 } 1780 1781 static void usage(const char* programName) 1782 { 1783 DISPLAY( "Usage :\n"); 1784 DISPLAY( " %s [args]\n", programName); 1785 DISPLAY( "\n"); 1786 DISPLAY( "Arguments :\n"); 1787 DISPLAY( " -p<path> : select output path (default:stdout)\n"); 1788 DISPLAY( " in multiple files mode this should be a directory\n"); 1789 DISPLAY( " -o<path> : select path to output original file (default:no output)\n"); 1790 DISPLAY( " in multiple files mode this should be a directory\n"); 1791 DISPLAY( " -s# : select seed (default:random based on time)\n"); 1792 DISPLAY( " -n# : number of files to generate (default:1)\n"); 1793 DISPLAY( " -t : 
activate test mode (test files against libzstd instead of outputting them)\n"); 1794 DISPLAY( " -T# : length of time to run tests for\n"); 1795 DISPLAY( " -v : increase verbosity level (default:0, max:7)\n"); 1796 DISPLAY( " -h/H : display help/long help and exit\n"); 1797 } 1798 1799 static void advancedUsage(const char* programName) 1800 { 1801 usage(programName); 1802 DISPLAY( "\n"); 1803 DISPLAY( "Advanced arguments :\n"); 1804 DISPLAY( " --content-size : always include the content size in the frame header\n"); 1805 DISPLAY( " --use-dict=# : include a dictionary used to decompress the corpus\n"); 1806 DISPLAY( " --gen-blocks : generate raw compressed blocks without block/frame headers\n"); 1807 DISPLAY( " --max-block-size-log=# : max block size log, must be in range [2, 17]\n"); 1808 DISPLAY( " --max-content-size-log=# : max content size log, must be <= 20\n"); 1809 DISPLAY( " (this is ignored with gen-blocks)\n"); 1810 DISPLAY( " --block-type=# : force certain block type (raw=0, rle=1, compressed=2)\n"); 1811 DISPLAY( " --frame-header-only : dump only frame header\n"); 1812 DISPLAY( " --no-magic : do not add magic number\n"); 1813 } 1814 1815 /*! readU32FromChar() : 1816 @return : unsigned integer value read from input in `char` format 1817 allows and interprets K, KB, KiB, M, MB and MiB suffix. 1818 Will also modify `*stringPtr`, advancing it to position where it stopped reading. 
1819 Note : function result can overflow if digit string > MAX_UINT */ 1820 static unsigned readU32FromChar(const char** stringPtr) 1821 { 1822 unsigned result = 0; 1823 while ((**stringPtr >='0') && (**stringPtr <='9')) 1824 result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; 1825 if ((**stringPtr=='K') || (**stringPtr=='M')) { 1826 result <<= 10; 1827 if (**stringPtr=='M') result <<= 10; 1828 (*stringPtr)++ ; 1829 if (**stringPtr=='i') (*stringPtr)++; 1830 if (**stringPtr=='B') (*stringPtr)++; 1831 } 1832 return result; 1833 } 1834 1835 /** longCommandWArg() : 1836 * check if *stringPtr is the same as longCommand. 1837 * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. 1838 * @return 0 and doesn't modify *stringPtr otherwise. 1839 */ 1840 static unsigned longCommandWArg(const char** stringPtr, const char* longCommand) 1841 { 1842 size_t const comSize = strlen(longCommand); 1843 int const result = !strncmp(*stringPtr, longCommand, comSize); 1844 if (result) *stringPtr += comSize; 1845 return result; 1846 } 1847 1848 int main(int argc, char** argv) 1849 { 1850 U32 seed = 0; 1851 int seedset = 0; 1852 unsigned numFiles = 0; 1853 unsigned testDuration = 0; 1854 int testMode = 0; 1855 const char* path = NULL; 1856 const char* origPath = NULL; 1857 int useDict = 0; 1858 unsigned dictSize = (10 << 10); /* 10 kB default */ 1859 genType_e genType = gt_frame; 1860 1861 int argNb; 1862 1863 /* Check command line */ 1864 for (argNb=1; argNb<argc; argNb++) { 1865 const char* argument = argv[argNb]; 1866 if(!argument) continue; /* Protection if argument empty */ 1867 1868 /* Handle commands. 
Aggregated commands are allowed */
        if (argument[0]=='-') {
            argument++;
            while (*argument!=0) {
                switch(*argument)
                {
                case 'h':
                    usage(argv[0]);
                    return 0;
                case 'H':
                    advancedUsage(argv[0]);
                    return 0;
                case 'v':
                    argument++;
                    g_displayLevel++;
                    break;
                case 's':
                    argument++;
                    seedset=1;
                    seed = readInt(&argument);
                    break;
                case 'n':
                    argument++;
                    numFiles = readInt(&argument);
                    break;
                case 'T':
                    /* duration in seconds; an 'm' or "mn" suffix means minutes */
                    argument++;
                    testDuration = readInt(&argument);
                    if (*argument == 'm') {
                        testDuration *= 60;
                        argument++;
                        if (*argument == 'n') argument++;
                    }
                    break;
                case 'o':
                    /* the rest of the argument is the path */
                    argument++;
                    origPath = argument;
                    argument += strlen(argument);
                    break;
                case 'p':
                    /* the rest of the argument is the path */
                    argument++;
                    path = argument;
                    argument += strlen(argument);
                    break;
                case 't':
                    argument++;
                    testMode = 1;
                    break;
                case '-':
                    /* long options */
                    argument++;
                    if (strcmp(argument, "content-size") == 0) {
                        opts.contentSize = 1;
                    } else if (longCommandWArg(&argument, "use-dict=")) {
                        dictSize = readU32FromChar(&argument);
                        useDict = 1;
                    } else if (strcmp(argument, "gen-blocks") == 0) {
                        genType = gt_block;
                    } else if (longCommandWArg(&argument, "max-block-size-log=")) {
                        U32 value = readU32FromChar(&argument);
                        /* `value` is a log2 : it must be bounded by the max block size LOG,
                         * otherwise the shift below can exceed the width of the type.
                         * Out-of-range values are ignored. */
                        if (value >= 2 && value <= ZSTD_BLOCKSIZELOG_MAX) {
                            g_maxBlockSize = 1U << value;
                        }
                    } else if (longCommandWArg(&argument, "max-content-size-log=")) {
                        U32 value = readU32FromChar(&argument);
                        g_maxDecompressedSizeLog =
                                MIN(MAX_DECOMPRESSED_SIZE_LOG, value);
                    } else if (longCommandWArg(&argument, "block-type=")) {
                        U32 value = readU32FromChar(&argument);
                        opts.blockType = malloc(sizeof(blockType_e));
                        if (opts.blockType == NULL) {
                            DISPLAY("Error: could not allocate space for block type\n");
                            return 1;
                        }
                        *(opts.blockType) = value;
                    } else if (longCommandWArg(&argument, "literal-type=")) {
                        U32 value = readU32FromChar(&argument);
                        opts.literalType =
malloc(sizeof(literalType_e)); 1941 *(opts.literalType) = value; 1942 } else if (strcmp(argument, "frame-header-only") == 0) { 1943 opts.frame_header_only = 1; 1944 } else if (strcmp(argument, "no-magic") == 0) { 1945 opts.no_magic = 1; 1946 } else { 1947 advancedUsage(argv[0]); 1948 return 1; 1949 } 1950 argument += strlen(argument); 1951 break; 1952 default: 1953 usage(argv[0]); 1954 return 1; 1955 } } } } /* for (argNb=1; argNb<argc; argNb++) */ 1956 1957 if (opts.blockType) { 1958 if ((opts.contentSize == 0) && (*(opts.blockType) == bt_rle)) { 1959 DISPLAY("Error: content-size has to be used together with blockType=1 (rle block)\n"); 1960 return 1; 1961 } 1962 1963 if (opts.literalType && (*(opts.blockType) != bt_compressed)) { 1964 DISPLAY("Error: literal-type can be used only with blockType=2 (compressed block)\n"); 1965 return 1; 1966 } 1967 } 1968 1969 if (!seedset) { 1970 seed = makeSeed(); 1971 } 1972 1973 if (testMode) { 1974 return runTestMode(seed, numFiles, testDuration, genType); 1975 } else { 1976 if (testDuration) { 1977 DISPLAY("Error: -T requires test mode (-t)\n\n"); 1978 usage(argv[0]); 1979 return 1; 1980 } 1981 } 1982 1983 if (!path) { 1984 DISPLAY("Error: path is required in file generation mode\n"); 1985 usage(argv[0]); 1986 return 1; 1987 } 1988 1989 if (numFiles == 0 && useDict == 0) { 1990 return generateFile(seed, path, origPath, genType); 1991 } else if (useDict == 0){ 1992 return generateCorpus(seed, numFiles, path, origPath, genType); 1993 } else { 1994 /* should generate files with a dictionary */ 1995 return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize, genType); 1996 } 1997 1998 } 1999