Home | History | Annotate | Line # | Download | only in contrib
words.sh revision 1.1
      1  1.1  christos #!/bin/sh
      2  1.1  christos 
      3  1.1  christos # Copyright (C) 2019-2020 Free Software Foundation, Inc.
      4  1.1  christos # This program is free software; you can redistribute it and/or modify
      5  1.1  christos # it under the terms of the GNU General Public License as published by
      6  1.1  christos # the Free Software Foundation; either version 3 of the License, or
      7  1.1  christos # (at your option) any later version.
      8  1.1  christos #
      9  1.1  christos # This program is distributed in the hope that it will be useful,
     10  1.1  christos # but WITHOUT ANY WARRANTY; without even the implied warranty of
     11  1.1  christos # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12  1.1  christos # GNU General Public License for more details.
     13  1.1  christos #
     14  1.1  christos # You should have received a copy of the GNU General Public License
     15  1.1  christos # along with this program.  If not, see <http://www.gnu.org/licenses/>.
     16  1.1  christos 
     17  1.1  christos # This script intends to facilitate spell checking of source/doc files.
     18  1.1  christos # It:
     19  1.1  christos # - transforms the files into a list of lowercase words
     20  1.1  christos # - prefixes each word with the frequency
     21  1.1  christos # - filters out words within a frequency range
     22  1.1  christos # - sorts the words, longest first
     23  1.1  christos #
     24  1.1  christos # If '-c' is passed as option, it operates on the C comments only, rather than
     25  1.1  christos # on the entire file.
     26  1.1  christos #
     27  1.1  christos # For:
     28  1.1  christos # ...
     29  1.1  christos # $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
     30  1.1  christos # $ ./gdb/contrib/words.sh -c $files
     31  1.1  christos # ...
     32  1.1  christos # it generates a list of ~15000 words prefixed with frequency.
     33  1.1  christos #
     34  1.1  christos # This could be used to generate a dictionary that is kept as part of the
     35  1.1  christos # sources, against which new code can be checked, generating a warning or
     36  1.1  christos # error.  The hope is that misspellings would trigger this frequently, and rare
     37  1.1  christos # words rarely, otherwise the burden of updating the dictionary would be too
     38  1.1  christos # much.
     39  1.1  christos #
     40  1.1  christos # And for:
     41  1.1  christos # ...
     42  1.1  christos # $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
     43  1.1  christos # $ ./gdb/contrib/words.sh -c -f 1 $files
     44  1.1  christos # ...
     45  1.1  christos # it generates a list of ~5000 words with frequency 1.
     46  1.1  christos #
     47  1.1  christos # This can be used to scan for misspellings manually.
     48  1.1  christos #
     49  1.1  christos 
# Option state, filled in by parse_options:
#   minfreq, maxfreq  inclusive frequency bounds; 0 means "no bound"
#   c                 'true' when only C comments are to be scanned (-c)
#   nopts             number of leading command-line words consumed
minfreq=
maxfreq=
c=false
nopts=0

# usage_error MESSAGE
# Print MESSAGE on stderr, prefixed with the script name, and exit with
# status 2.
usage_error ()
{
    printf '%s: %s\n' "${0##*/}" "$1" >&2
    exit 2
}

# require_freq OPTION ARGC VALUE
# Check that OPTION was followed by an argument (ARGC, the number of words
# left including OPTION, is at least 2) and that VALUE is a non-negative
# integer.  Without this check a trailing '-f' makes 'shift 2' fail, and a
# shell that does not abort on that failure never advances and loops
# forever; a non-numeric value would end up verbatim in an awk expression.
require_freq ()
{
    if [ "$2" -lt 2 ]; then
	usage_error "option '$1' requires an argument"
    fi
    case "$3" in
	'' | *[!0-9]*)
	    usage_error "option '$1' expects a non-negative integer, got '$3'"
	    ;;
    esac
}

# parse_options [ARG...]
# Parse the leading options of ARG...:
#   -c              operate on C comments only
#   -f N, --freq N  keep words whose frequency is exactly N
#   --min N         keep words whose frequency is at least N
#   --max N         keep words whose frequency is at most N
# Parsing stops at the first word that is not one of these options.  Sets
# minfreq, maxfreq, c and nopts; when no bound was given both bounds are
# set to 0 (no filtering).
parse_options ()
{
    minfreq=
    maxfreq=
    c=false
    nopts=0

    while [ $# -gt 0 ]; do
	case "$1" in
	    -c)
		c=true
		shift
		nopts=$((nopts + 1))
		;;
	    --freq|-f)
		require_freq "$1" "$#" "$2"
		minfreq=$2
		maxfreq=$2
		shift 2
		nopts=$((nopts + 2))
		;;
	    --min)
		require_freq "$1" "$#" "$2"
		minfreq=$2
		# Only a lower bound so far: leave the upper bound open.
		if [ "$maxfreq" = "" ]; then
		    maxfreq=0
		fi
		shift 2
		nopts=$((nopts + 2))
		;;
	    --max)
		require_freq "$1" "$#" "$2"
		maxfreq=$2
		# Only an upper bound so far: leave the lower bound open.
		if [ "$minfreq" = "" ]; then
		    minfreq=0
		fi
		shift 2
		nopts=$((nopts + 2))
		;;
	    *)
		break
		;;
	esac
    done

    if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
	minfreq=0
	maxfreq=0
    fi
}

parse_options "$@"
# Drop the parsed options so that "$@" holds only the input files.
shift "$nopts"
     88  1.1  christos 
# Write the C comment extractor to a temporary awk script; the path is
# left in $awkfile and the file is removed when the script exits.
awkfile=$(mktemp) || exit 1
trap 'rm -f "$awkfile"' EXIT

# The here-document delimiter is quoted, so the awk program below is taken
# literally and needs no shell escaping.
cat > "$awkfile" <<'EOF'
# Print the text found inside /* ... */ comments.  One line is printed for
# every input line that lies in, or contains, a comment.  in_comment
# carries the "inside a comment" state from one input line to the next.
BEGIN {
    in_comment = 0
}

{
    rest = $0
    text = ""
    # A line that starts inside a comment is printed even when it is empty.
    seen = in_comment

    # Scan left to right so that several comments on one line, and a
    # comment end followed by a new comment start, are all handled.
    while (rest != "") {
	if (in_comment) {
	    pos = index(rest, "*/")
	    if (pos == 0) {
		# The comment continues on the next line.
		text = text rest
		rest = ""
	    } else {
		text = text substr(rest, 1, pos - 1)
		rest = substr(rest, pos + 2)
		in_comment = 0
	    }
	} else {
	    pos = index(rest, "/*")
	    if (pos == 0) {
		# Only code is left on this line.
		rest = ""
	    } else {
		# Keep the words of adjacent comments apart.
		if (text != "")
		    text = text " "
		rest = substr(rest, pos + 2)
		in_comment = 1
		seen = 1
	    }
	}
    }

    if (seen)
	print text
}
EOF
    119  1.1  christos 
# Stabilize sort.
export LC_ALL=C

# Pipeline stages:
# 1. Select the input: the C comments only (-c) or the complete files.
# 2. Put every word on its own line by turning punctuation, blanks and
#    digit runs into newlines.  This relies on sed interpreting the \n and
#    \t escapes, as GNU sed does.
# 3. Lowercase, then count identical words.
# 4. Keep the words whose count lies within [minfreq, maxfreq], where 0
#    means "no bound".  The bounds are passed with -v rather than pasted
#    into the program text, so an empty or malformed value can neither
#    break the awk syntax nor inject code.  'NF > 1' drops the count-only
#    line that results from the blank lines produced in stage 2.
# 5. Sort by line length, longest first, using a temporary length prefix.
if $c; then
    awk \
	-f "$awkfile" \
	-- "$@"
else
    cat "$@"
fi \
    | sed \
	  -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
	  -e 's/\[/\n/g' \
	  -e 's/\]/\n/g' \
	  -e "s/'/\n/g" \
	  -e 's/[0-9][0-9]*/\n/g' \
	  -e 's/[ \t]*//g' \
    | tr '[:upper:]' '[:lower:]' \
    | sort \
    | uniq -c \
    | awk -v min="$minfreq" -v max="$maxfreq" '
	BEGIN {
	    # Force a numeric interpretation; an empty value becomes 0.
	    min += 0
	    max += 0
	}
	NF > 1 && (min == 0 || min <= $1) && (max == 0 || $1 <= max) {
	    print $0
	}' \
    | awk '{ print length($0) " " $0; }' \
    | sort -n -r \
    | cut -d ' ' -f 2-
    145