1 1.13 sevan /* $NetBSD: locate.bigram.c,v 1.13 2016/09/05 00:40:29 sevan Exp $ */ 2 1.3 jtc 3 1.1 cgd /* 4 1.3 jtc * Copyright (c) 1989, 1993 5 1.3 jtc * The Regents of the University of California. All rights reserved. 6 1.1 cgd * 7 1.1 cgd * This code is derived from software contributed to Berkeley by 8 1.1 cgd * James A. Woods. 9 1.1 cgd * 10 1.1 cgd * Redistribution and use in source and binary forms, with or without 11 1.1 cgd * modification, are permitted provided that the following conditions 12 1.1 cgd * are met: 13 1.1 cgd * 1. Redistributions of source code must retain the above copyright 14 1.1 cgd * notice, this list of conditions and the following disclaimer. 15 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright 16 1.1 cgd * notice, this list of conditions and the following disclaimer in the 17 1.1 cgd * documentation and/or other materials provided with the distribution. 18 1.10 agc * 3. Neither the name of the University nor the names of its contributors 19 1.1 cgd * may be used to endorse or promote products derived from this software 20 1.1 cgd * without specific prior written permission. 21 1.1 cgd * 22 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 1.1 cgd * SUCH DAMAGE. 33 1.1 cgd */ 34 1.1 cgd 35 1.6 lukem #include <sys/cdefs.h> 36 1.1 cgd #ifndef lint 37 1.11 lukem __COPYRIGHT("@(#) Copyright (c) 1989, 1993\ 38 1.11 lukem The Regents of the University of California. All rights reserved."); 39 1.1 cgd #endif /* not lint */ 40 1.1 cgd 41 1.1 cgd #ifndef lint 42 1.3 jtc #if 0 43 1.4 jtc static char sccsid[] = "@(#)locate.bigram.c 8.2 (Berkeley) 4/28/95"; 44 1.3 jtc #endif 45 1.13 sevan __RCSID("$NetBSD: locate.bigram.c,v 1.13 2016/09/05 00:40:29 sevan Exp $"); 46 1.1 cgd #endif /* not lint */ 47 1.1 cgd 48 1.1 cgd /* 49 1.1 cgd * bigram < text > bigrams 50 1.1 cgd * 51 1.1 cgd * List bigrams for 'updatedb' script. 52 1.1 cgd * Use 'code' to encode a file using this output. 53 1.1 cgd */ 54 1.1 cgd 55 1.1 cgd #include <stdio.h> 56 1.7 jdolecek #include <stdlib.h> 57 1.8 simonb #include <string.h> 58 1.1 cgd #include <sys/param.h> /* for MAXPATHLEN */ 59 1.1 cgd 60 1.12 drochner static int compare_bigrams(const void *, const void *); 61 1.12 drochner static void add_bigram(u_char, u_char); 62 1.7 jdolecek 63 1.7 jdolecek static char buf1[MAXPATHLEN] = " "; 64 1.7 jdolecek static char buf2[MAXPATHLEN]; 65 1.7 jdolecek 66 1.7 jdolecek struct bigram { 67 1.7 jdolecek int count; 68 1.7 jdolecek u_char b1, b2; /* needed for final sorting */ 69 1.7 jdolecek }; 70 1.7 jdolecek 71 1.7 jdolecek struct bigram bigrams[256 * 256]; 72 1.7 jdolecek 73 1.7 jdolecek static void 74 1.12 drochner add_bigram(u_char i1, u_char i2) 75 1.7 jdolecek { 76 1.9 mycroft if (i1 != '\n' && i2 != '\n') 77 1.9 mycroft bigrams[(i1<<8)+i2].count++; 78 1.7 jdolecek } 79 1.7 jdolecek 80 1.7 jdolecek static int 81 1.12 drochner compare_bigrams(const void *item1, const void *item2) 82 1.7 jdolecek { 83 1.7 jdolecek const struct bigram *it1=item1, *it2=item2; 84 1.1 cgd 85 1.7 jdolecek if (it1->count != it2->count) 86 1.7 jdolecek return it2->count - it1->count; 87 1.7 jdolecek else if (it1->b1 != it2->b1) 88 1.7 jdolecek return it2->b1 - it1->b1; 89 1.7 jdolecek else 90 1.7 jdolecek return it2->b2 - it2->b2; 91 1.7 jdolecek } 92 1.6 lukem 93 1.6 lukem int 94 1.12 drochner main(int argc, char *argv[]) 95 1.1 cgd { 96 1.6 lukem char *cp; 97 1.6 lukem char *oldpath = buf1, *path = buf2; 98 1.7 jdolecek struct bigram *bg; 99 1.7 jdolecek int i; 100 1.7 jdolecek 101 1.7 jdolecek /* initialize bigram array */ 102 1.7 jdolecek memset(bigrams, 0, sizeof(bigrams)); 103 1.7 jdolecek for(i=0; i < 65536; i++) { 104 1.7 jdolecek bigrams[i].b1 = i / 256; 105 1.7 jdolecek bigrams[i].b2 = i % 256; 106 1.7 jdolecek } 107 1.1 cgd 108 1.1 cgd while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { 109 1.1 cgd 110 1.1 cgd /* skip longest common prefix */ 111 1.1 cgd for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) 112 1.4 jtc if ( *oldpath == '\0' ) 113 1.1 cgd break; 114 1.7 jdolecek 115 1.1 cgd /* 116 1.1 cgd * output post-residue bigrams only 117 1.1 cgd */ 118 1.7 jdolecek for(; cp[0] != '\0' && cp[1] != '\0'; cp += 2) 119 1.7 jdolecek add_bigram((u_char)cp[0], (u_char)cp[1]); 120 1.7 jdolecek 121 1.7 jdolecek if (path == buf1) /* swap pointers */ 122 1.1 cgd path = buf2, oldpath = buf1; 123 1.1 cgd else 124 1.1 cgd path = buf1, oldpath = buf2; 125 1.1 cgd } 126 1.7 jdolecek 127 1.7 jdolecek /* sort the bigrams by how many times it appeared and their value */ 128 1.7 jdolecek heapsort((void *)bigrams, 256 * 256, sizeof(struct bigram), 129 1.7 jdolecek compare_bigrams); 130 1.7 jdolecek 131 1.7 jdolecek /* write 128 most frequent bigrams out */ 132 1.7 jdolecek bg = bigrams; 133 1.7 jdolecek for (i = 0; i < 128 && bg->count > 0; i++, bg++) { 134 1.7 jdolecek if (bg->b1 != '\0') 135 1.7 jdolecek fputc(bg->b1, stdout); 136 1.7 jdolecek if (bg->b2 != '\0') 137 1.7 jdolecek fputc(bg->b2, stdout); 138 1.7 jdolecek } 139 1.7 jdolecek 140 1.4 jtc return (0); 141 1.1 cgd } 142