Home | History | Annotate | Line # | Download | only in programs
      1 /*
      2  * Copyright (c) Meta Platforms, Inc. and affiliates.
      3  * All rights reserved.
      4  *
      5  * This source code is licensed under both the BSD-style license (found in the
      6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
      7  * in the COPYING file in the root directory of this source tree).
      8  * You may select, at your option, one of the above-listed licenses.
      9  */
     10 
/* Implementation notes:
 *
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another, randomly,
 * with a fake sentence / paragraph structure.
 *
 * The goal is to generate printable text
 * that can be used to fake a text compression scenario.
 * The resulting compression / ratio curve of the lorem ipsum generator
 * is more satisfying than that of the previous statistical generator,
 * which was initially designed for entropy compression
 * and lacks the regularity that is more representative of text.
 *
 * The compression ratio achievable on the generated lorem ipsum
 * is still a bit too good, presumably because the dictionary is a bit too
 * small. It would be possible to create some more complex scheme, notably by
 * enlarging the dictionary with a word generator, and adding grammatical rules
 * (composition) and syntax rules. But that's probably overkill for the intended
 * goal.
 */
     32 
     33 #include "lorem.h"
     34 #include <assert.h>
     35 #include <limits.h> /* INT_MAX */
     36 #include <string.h> /* memcpy */
     37 
     38 #define WORD_MAX_SIZE 20
     39 
/* Word pool used by the generator.
 * Order matters: words are addressed by index, and the leading entries
 * (indices 0 to 18) are emitted in sequence as the fixed opening sentence. */
static const char* kWords[] = {
    "lorem", "ipsum", "dolor", "sit", "amet",
    "consectetur", "adipiscing", "elit", "sed", "do",
    "eiusmod", "tempor", "incididunt", "ut", "labore",
    "et", "dolore", "magna", "aliqua", "dis",
    "lectus", "vestibulum", "mattis", "ullamcorper", "velit",
    "commodo", "a", "lacus", "arcu", "magnis",
    "parturient", "montes", "nascetur", "ridiculus", "mus",
    "mauris", "nulla", "malesuada", "pellentesque", "eget",
    "gravida", "in", "dictum", "non", "erat",
    "nam", "voluptat", "maecenas", "blandit", "aliquam",
    "etiam", "enim", "lobortis", "scelerisque", "fermentum",
    "dui", "faucibus", "ornare", "at", "elementum",
    "eu", "facilisis", "odio", "morbi", "quis",
    "eros", "donec", "ac", "orci", "purus",
    "turpis", "cursus", "leo", "vel", "porta",
    "consequat", "interdum", "varius", "vulputate", "aliquet",
    "pharetra", "nunc", "auctor", "urna", "id",
    "metus", "viverra", "nibh", "cras", "mi",
    "unde", "omnis", "iste", "natus", "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium",
    "totam", "rem", "aperiam", "eaque", "ipsa",
    "quae", "ab", "illo", "inventore", "veritatis",
    "quasi", "architecto", "beatae", "vitae", "dicta",
    "sunt", "explicabo", "nemo", "ipsam", "quia",
    "voluptas", "aspernatur", "aut", "odit", "fugit",
    "consequuntur", "magni", "dolores", "eos", "qui",
    "ratione", "sequi", "nesciunt", "neque", "porro",
    "quisquam", "est", "dolorem", "adipisci", "numquam",
    "eius", "modi", "tempora", "incidunt", "magnam",
    "quaerat", "ad", "minima", "veniam", "nostrum",
    "ullam", "corporis", "suscipit", "laboriosam", "nisi",
    "aliquid", "ex", "ea", "commodi", "consequatur",
    "autem", "eum", "iure", "voluptate", "esse",
    "quam", "nihil", "molestiae", "illum", "fugiat",
    "quo", "pariatur", "vero", "accusamus", "iusto",
    "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum",
    "deleniti", "atque", "corrupti", "quos", "quas",
    "molestias", "excepturi", "sint", "occaecati", "cupiditate",
    "provident", "similique", "culpa", "officia", "deserunt",
    "mollitia", "animi", "laborum", "dolorum", "fuga",
    "harum", "quidem", "rerum", "facilis", "expedita",
    "distinctio", "libero", "tempore", "cum", "soluta",
    "nobis", "eligendi", "optio", "cumque", "impedit",
    "minus", "quod", "maxime", "placeat", "facere",
    "possimus", "assumenda", "repellendus", "temporibus", "quibusdam",
    "officiis", "debitis", "saepe", "eveniet", "voluptates",
    "repudiandae", "recusandae", "itaque", "earum", "hic",
    "tenetur", "sapiente", "delectus", "reiciendis", "cillum",
    "maiores", "alias", "perferendis", "doloribus", "asperiores",
    "repellat", "minim", "nostrud", "exercitation", "ullamco",
    "laboris", "aliquip", "duis", "aute", "irure",
};
/* Number of entries in kWords, derived from the array itself. */
static const unsigned kNbWords = (unsigned)(sizeof kWords / sizeof *kWords);
     95 
/* One-dimensional selection weights, indexed by word length.
 * Short words receive larger weights, so they are drawn more often;
 * lengths beyond the end of the table are clamped to its last entry. */
static const int kWeights[]    = { 0, 8, 6, 4, 3, 2 };
static const size_t kNbWeights = sizeof kWeights / sizeof *kWeights;

/* Flattened distribution table: it holds word indices, each one repeated as
 * many times as its weight. g_distribCount is the number of slots in use;
 * it stays 0 until the table has been built. Both objects have static storage
 * duration and therefore start zero-initialized. */
#define DISTRIB_SIZE_MAX 650
static int g_distrib[DISTRIB_SIZE_MAX];
static unsigned g_distribCount;
    104 
    105 static void countFreqs(
    106         const char* words[],
    107         size_t nbWords,
    108         const int* weights,
    109         size_t nbWeights)
    110 {
    111     unsigned total = 0;
    112     size_t w;
    113     for (w = 0; w < nbWords; w++) {
    114         size_t len = strlen(words[w]);
    115         int lmax;
    116         if (len >= nbWeights)
    117             len = nbWeights - 1;
    118         lmax = weights[len];
    119         total += (unsigned)lmax;
    120     }
    121     g_distribCount = total;
    122     assert(g_distribCount <= DISTRIB_SIZE_MAX);
    123 }
    124 
    125 static void init_word_distrib(
    126         const char* words[],
    127         size_t nbWords,
    128         const int* weights,
    129         size_t nbWeights)
    130 {
    131     size_t w, d = 0;
    132     countFreqs(words, nbWords, weights, nbWeights);
    133     for (w = 0; w < nbWords; w++) {
    134         size_t len = strlen(words[w]);
    135         int l, lmax;
    136         if (len >= nbWeights)
    137             len = nbWeights - 1;
    138         lmax = weights[len];
    139         for (l = 0; l < lmax; l++) {
    140             g_distrib[d++] = (int)w;
    141         }
    142     }
    143 }
    144 
/* Generator state, shared by the helpers below.
 * Because it lives in file-scope variables, this unit only works when invoked
 * sequentially: no concurrent access is allowed. */
static char* g_ptr;                   /* destination buffer (starts NULL) */
static size_t g_nbChars;              /* characters written so far (starts 0) */
static size_t g_maxChars = 10000000;  /* capacity of the destination buffer */
static unsigned g_randRoot;           /* random generator state (starts 0) */
    151 
    152 #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r)))
    153 static unsigned LOREM_rand(unsigned range)
    154 {
    155     static const unsigned prime1 = 2654435761U;
    156     static const unsigned prime2 = 2246822519U;
    157     unsigned rand32              = g_randRoot;
    158     rand32 *= prime1;
    159     rand32 ^= prime2;
    160     rand32     = RDG_rotl32(rand32, 13);
    161     g_randRoot = rand32;
    162     return (unsigned)(((unsigned long long)rand32 * range) >> 32);
    163 }
    164 
    165 static void writeLastCharacters(void)
    166 {
    167     size_t lastChars = g_maxChars - g_nbChars;
    168     assert(g_maxChars >= g_nbChars);
    169     if (lastChars == 0)
    170         return;
    171     g_ptr[g_nbChars++] = '.';
    172     if (lastChars > 2) {
    173         memset(g_ptr + g_nbChars, ' ', lastChars - 2);
    174     }
    175     if (lastChars > 1) {
    176         g_ptr[g_maxChars - 1] = '\n';
    177     }
    178     g_nbChars = g_maxChars;
    179 }
    180 
    181 static void generateWord(const char* word, const char* separator, int upCase)
    182 {
    183     size_t const len = strlen(word) + strlen(separator);
    184     if (g_nbChars + len > g_maxChars) {
    185         writeLastCharacters();
    186         return;
    187     }
    188     memcpy(g_ptr + g_nbChars, word, strlen(word));
    189     if (upCase) {
    190         static const char toUp = 'A' - 'a';
    191         g_ptr[g_nbChars]       = (char)(g_ptr[g_nbChars] + toUp);
    192     }
    193     g_nbChars += strlen(word);
    194     memcpy(g_ptr + g_nbChars, separator, strlen(separator));
    195     g_nbChars += strlen(separator);
    196 }
    197 
/* Returns a random count "around" target: the sum of two independent draws
 * from LOREM_rand(target), plus one. */
static int about(unsigned target)
{
    unsigned const firstDraw  = LOREM_rand(target);
    unsigned const secondDraw = LOREM_rand(target);
    return (int)(firstDraw + secondDraw + 1);
}
    202 
    203 /* Function to generate a random sentence */
    204 static void generateSentence(int nbWords)
    205 {
    206     int commaPos       = about(9);
    207     int comma2         = commaPos + about(7);
    208     int qmark          = (LOREM_rand(11) == 7);
    209     const char* endSep = qmark ? "? " : ". ";
    210     int i;
    211     for (i = 0; i < nbWords; i++) {
    212         int const wordID       = g_distrib[LOREM_rand(g_distribCount)];
    213         const char* const word = kWords[wordID];
    214         const char* sep        = " ";
    215         if (i == commaPos)
    216             sep = ", ";
    217         if (i == comma2)
    218             sep = ", ";
    219         if (i == nbWords - 1)
    220             sep = endSep;
    221         generateWord(word, sep, i == 0);
    222     }
    223 }
    224 
    225 static void generateParagraph(int nbSentences)
    226 {
    227     int i;
    228     for (i = 0; i < nbSentences; i++) {
    229         int wordsPerSentence = about(11);
    230         generateSentence(wordsPerSentence);
    231     }
    232     if (g_nbChars < g_maxChars) {
    233         g_ptr[g_nbChars++] = '\n';
    234     }
    235     if (g_nbChars < g_maxChars) {
    236         g_ptr[g_nbChars++] = '\n';
    237     }
    238 }
    239 
    240 /* It's "common" for lorem ipsum generators to start with the same first
    241  * pre-defined sentence */
    242 static void generateFirstSentence(void)
    243 {
    244     int i;
    245     for (i = 0; i < 18; i++) {
    246         const char* word      = kWords[i];
    247         const char* separator = " ";
    248         if (i == 4)
    249             separator = ", ";
    250         if (i == 7)
    251             separator = ", ";
    252         generateWord(word, separator, i == 0);
    253     }
    254     generateWord(kWords[18], ". ", 0);
    255 }
    256 
    257 size_t
    258 LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill)
    259 {
    260     g_ptr = (char*)buffer;
    261     assert(size < INT_MAX);
    262     g_maxChars = size;
    263     g_nbChars  = 0;
    264     g_randRoot = seed;
    265     if (g_distribCount == 0) {
    266         init_word_distrib(kWords, kNbWords, kWeights, kNbWeights);
    267     }
    268 
    269     if (first) {
    270         generateFirstSentence();
    271     }
    272     while (g_nbChars < g_maxChars) {
    273         int sentencePerParagraph = about(7);
    274         generateParagraph(sentencePerParagraph);
    275         if (!fill)
    276             break; /* only generate one paragraph in not-fill mode */
    277     }
    278     g_ptr = NULL;
    279     return g_nbChars;
    280 }
    281 
/* Fills the whole of `buffer` (`size` characters) with lorem ipsum text
 * generated from `seed`, starting with the fixed opening sentence. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const withFirstSentence = 1;
    int const fillWholeBuffer   = 1;
    (void)LOREM_genBlock(
            buffer, size, seed, withFirstSentence, fillWholeBuffer);
}
    286