Home | History | Annotate | Line # | Download | only in contrib
words.sh revision 1.1.1.3
      1      1.1  christos #!/bin/sh
      2      1.1  christos 
      3  1.1.1.3  christos # Copyright (C) 2019-2024 Free Software Foundation, Inc.
      4      1.1  christos # This program is free software; you can redistribute it and/or modify
      5      1.1  christos # it under the terms of the GNU General Public License as published by
      6      1.1  christos # the Free Software Foundation; either version 3 of the License, or
      7      1.1  christos # (at your option) any later version.
      8      1.1  christos #
      9      1.1  christos # This program is distributed in the hope that it will be useful,
     10      1.1  christos # but WITHOUT ANY WARRANTY; without even the implied warranty of
     11      1.1  christos # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12      1.1  christos # GNU General Public License for more details.
     13      1.1  christos #
     14      1.1  christos # You should have received a copy of the GNU General Public License
     15      1.1  christos # along with this program.  If not, see <http://www.gnu.org/licenses/>.
     16      1.1  christos 
     17      1.1  christos # This script intends to facilitate spell checking of source/doc files.
     18      1.1  christos # It:
     19      1.1  christos # - transforms the files into a list of lowercase words
     20      1.1  christos # - prefixes each word with the frequency
     21      1.1  christos # - filters out words within a frequency range
     22      1.1  christos # - sorts the words, longest first
     23      1.1  christos #
     24      1.1  christos # If '-c' is passed as option, it operates on the C comments only, rather than
     25      1.1  christos # on the entire file.
     26      1.1  christos #
     27      1.1  christos # For:
     28      1.1  christos # ...
     29      1.1  christos # $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
     30      1.1  christos # $ ./gdb/contrib/words.sh -c $files
     31      1.1  christos # ...
     32      1.1  christos # it generates a list of ~15000 words prefixed with frequency.
     33      1.1  christos #
     34      1.1  christos # This could be used to generate a dictionary that is kept as part of the
     35      1.1  christos # sources, against which new code can be checked, generating a warning or
     36      1.1  christos # error.  The hope is that misspellings would trigger this frequently, and rare
     37      1.1  christos # words rarely, otherwise the burden of updating the dictionary would be too
     38      1.1  christos # much.
     39      1.1  christos #
     40      1.1  christos # And for:
     41      1.1  christos # ...
     42      1.1  christos # $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
     43      1.1  christos # $ ./gdb/contrib/words.sh -c -f 1 $files
     44      1.1  christos # ...
     45      1.1  christos # it generates a list of ~5000 words with frequency 1.
     46      1.1  christos #
     47      1.1  christos # This can be used to scan for misspellings manually.
     48      1.1  christos #
     49      1.1  christos 
# Command-line state.
#   minfreq / maxfreq  inclusive frequency bounds; 0 means "no bound".
#   c                  "true" when only C comments are to be scanned (-c).
minfreq=
maxfreq=
c=false

# require_count OPTION [VALUE ...]
# Called with the not-yet-consumed command line ("$@").  Exits with status 2
# and a diagnostic on stderr unless OPTION is followed by a VALUE made only of
# decimal digits.  Without this check a trailing "--min" makes "shift 2" fail
# (bash then leaves the arguments in place and the option loop never ends),
# and a non-numeric value would later be handed to awk as a frequency.
require_count ()
{
    if [ $# -lt 2 ]; then
	echo "$0: option '$1' requires an argument" >&2
	exit 2
    fi
    case "$2" in
	'' | *[!0-9]*)
	    echo "$0: option '$1' expects a non-negative integer, got '$2'" >&2
	    exit 2
	    ;;
    esac
}

# Consume leading options; the first unrecognized argument and everything
# after it are left in "$@" as the list of input files.
while [ $# -gt 0 ]; do
    case "$1" in
	-c)
	    c=true
	    shift
	    ;;
	--freq|-f)
	    # Exact frequency: both bounds are set to the same value.
	    require_count "$@"
	    minfreq=$2
	    maxfreq=$2
	    shift 2
	    ;;
	--min)
	    require_count "$@"
	    minfreq=$2
	    # A lower bound on its own leaves the upper bound open.
	    if [ "$maxfreq" = "" ]; then
		maxfreq=0
	    fi
	    shift 2
	    ;;
	--max)
	    require_count "$@"
	    maxfreq=$2
	    # An upper bound on its own leaves the lower bound open.
	    if [ "$minfreq" = "" ]; then
		minfreq=0
	    fi
	    shift 2
	    ;;
	*)
	    break
	    ;;
    esac
done

# No frequency option at all: keep every word.
if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
    minfreq=0
    maxfreq=0
fi
     88      1.1  christos 
# Write the awk program used by '-c' to a temporary file.  The program prints
# only the text found inside C block comments (/* ... */).
awkfile=$(mktemp) || {
    echo "$0: cannot create temporary file" >&2
    exit 1
}
# Remove the temporary file on every exit path.  The explicit signal trap
# turns HUP/INT/TERM into a normal exit so the EXIT trap still runs in shells
# that do not run it when killed by a signal.
trap 'rm -f "$awkfile"' EXIT
trap 'exit 1' HUP INT TERM

# The heredoc delimiter is quoted so the awk text is written out verbatim.
#
# Each input line is scanned left to right, alternating between looking for
# the next "/*" (outside a comment) and the next "*/" (inside one).  This
# keeps every comment on a line, not just the last one, and leaves in_comment
# correct when a line closes one comment and opens another.  Comment pieces
# from the same line are joined with a single space.  A line is printed when
# any part of it lies inside a comment, including empty lines in a comment.
cat > "$awkfile" <<'EOF'
BEGIN {
    in_comment = 0
}

{
    rest = $0
    text = ""
    sep = ""
    emit = in_comment

    while (rest != "") {
	if (in_comment) {
	    pos = index(rest, "*/")
	    if (pos == 0) {
		piece = rest
		rest = ""
	    } else {
		piece = substr(rest, 1, pos - 1)
		rest = substr(rest, pos + 2)
		in_comment = 0
	    }
	    text = text sep piece
	    sep = " "
	} else {
	    pos = index(rest, "/*")
	    if (pos == 0) {
		rest = ""
	    } else {
		rest = substr(rest, pos + 2)
		in_comment = 1
		emit = 1
	    }
	}
    }

    if (emit) {
	print text
    }
}
EOF
    119      1.1  christos 
# Stabilize sort.
export LC_ALL=C

# Pipeline stages:
#  1. Select the text: C comments only (-c, via the generated awk program) or
#     the whole input.
#  2. Turn punctuation, digits and whitespace into line breaks so that each
#     output line holds at most one word.
#  3. Lowercase, then drop the empty lines produced by step 2 so they are not
#     counted as a word.
#  4. Count identical words (sort | uniq -c).
#  5. Keep the lines whose count lies within [minfreq, maxfreq], where 0 means
#     "no bound", and prefix each with its length.  The bounds are passed with
#     -v instead of being pasted into the awk source, so an unexpected value
#     cannot alter the program.
#  6. Sort longest first and strip the length prefix again.
if $c; then
    awk \
	-f "$awkfile" \
	-- "$@"
else
    cat -- "$@"
fi \
    | sed \
	  -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
	  -e 's/\[/\n/g' \
	  -e 's/\]/\n/g' \
	  -e "s/'/\n/g" \
	  -e 's/[0-9][0-9]*/\n/g' \
	  -e 's/[ \t]*//g' \
    | tr '[:upper:]' '[:lower:]' \
    | sed -e '/^$/d' \
    | sort \
    | uniq -c \
    | awk -v min="${minfreq:-0}" -v max="${maxfreq:-0}" '
	  (min == 0 || min <= $1) && (max == 0 || $1 <= max) {
	      print length($0) " " $0
	  }' \
    | sort -n -r \
    | cut -d ' ' -f 2-
    145