Home | History | Annotate | Line # | Download | only in contrib
      1 #!/bin/sh
      2 
      3 # Copyright (C) 2019-2024 Free Software Foundation, Inc.
      4 # This program is free software; you can redistribute it and/or modify
      5 # it under the terms of the GNU General Public License as published by
      6 # the Free Software Foundation; either version 3 of the License, or
      7 # (at your option) any later version.
      8 #
      9 # This program is distributed in the hope that it will be useful,
     10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
     11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12 # GNU General Public License for more details.
     13 #
     14 # You should have received a copy of the GNU General Public License
     15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
     16 
     17 # This script intends to facilitate spell checking of source/doc files.
     18 # It:
     19 # - transforms the files into a list of lowercase words
     20 # - prefixes each word with the frequency
     21 # - filters out words within a frequency range
     22 # - sorts the words, longest first
     23 #
     24 # If '-c' is passed as option, it operates on the C comments only, rather than
     25 # on the entire file.
     26 #
     27 # For:
     28 # ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
     30 # $ ./gdb/contrib/words.sh -c $files
     31 # ...
     32 # it generates a list of ~15000 words prefixed with frequency.
     33 #
     34 # This could be used to generate a dictionary that is kept as part of the
     35 # sources, against which new code can be checked, generating a warning or
     36 # error.  The hope is that misspellings would trigger this frequently, and rare
     37 # words rarely, otherwise the burden of updating the dictionary would be too
     38 # much.
     39 #
     40 # And for:
     41 # ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
     43 # $ ./gdb/contrib/words.sh -c -f 1 $files
     44 # ...
     45 # it generates a list of ~5000 words with frequency 1.
     46 #
     47 # This can be used to scan for misspellings manually.
     48 #
     49 
# Exit with a usage error unless option $1 is followed by a non-negative
# integer.
#   $1: option name, used in the diagnostic
#   $2: number of arguments left, including the option itself
#   $3: candidate value (may be empty when missing)
# The count check matters: with only the option left, 'shift 2' fails, and
# in shells where that is not fatal the parsing loop would never terminate.
check_freq_arg ()
{
    if [ "$2" -lt 2 ]; then
	echo "$0: option '$1' requires an argument" >&2
	exit 2
    fi
    case "$3" in
	'' | *[!0-9]*)
	    echo "$0: option '$1' expects a non-negative integer, got '$3'" >&2
	    exit 2
	    ;;
    esac
}

# Parse the leading options in "$@".
# Sets the globals:
#   c         'true' if '-c' was given, otherwise 'false'
#   minfreq   lowest accepted frequency, 0 meaning "no lower bound"
#   maxfreq   highest accepted frequency, 0 meaning "no upper bound"
#   noptargs  number of leading arguments consumed as options
# Parsing stops at the first argument that is not a known option; that
# argument and everything after it are left for the caller.
parse_options ()
{
    minfreq=
    maxfreq=
    c=false
    noptargs=0

    while [ $# -gt 0 ]; do
	case "$1" in
	    -c)
		c=true
		shift
		noptargs=$((noptargs + 1))
		;;
	    --freq|-f)
		check_freq_arg "$1" $# "${2-}"
		minfreq=$2
		maxfreq=$2
		shift 2
		noptargs=$((noptargs + 2))
		;;
	    --min)
		check_freq_arg "$1" $# "${2-}"
		minfreq=$2
		shift 2
		noptargs=$((noptargs + 2))
		;;
	    --max)
		check_freq_arg "$1" $# "${2-}"
		maxfreq=$2
		shift 2
		noptargs=$((noptargs + 2))
		;;
	    *)
		break
		;;
	esac
    done

    # A bound that was not given on the command line means "unbounded".
    : "${minfreq:=0}" "${maxfreq:=0}"
}

parse_options "$@"
# Drop the options so that "$@" holds only the input files.
shift "$noptargs"
     88 
# Create the awk program that extracts the text of C comments.  It lives in
# a temporary file that is removed when the script exits.
awkfile=$(mktemp) || {
    echo "$0: cannot create temporary file" >&2
    exit 1
}
trap 'rm -f "$awkfile"' EXIT
# Not every shell runs the EXIT trap when killed by a signal, so turn the
# common ones into a normal exit to make sure the file is removed.
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# The delimiter is quoted so the shell leaves '$0' alone.
cat > "$awkfile" <<'EOF'
# Print the text found inside /* ... */ comments.  Each input line is
# scanned left to right, so several comments on one line, and a line that
# closes one comment and opens the next, are all handled.  in_comment
# carries the state across lines for multi-line comments.
BEGIN {
    in_comment = 0
}

{
    rest = $0
    text = ""
    # A line that starts inside a comment is printed even when empty.
    has_comment = in_comment

    while (rest != "") {
	if (in_comment) {
	    pos = index(rest, "*/")
	    if (pos == 0) {
		# Comment continues on the next line.
		text = text rest
		rest = ""
	    } else {
		# The blank keeps adjacent comments from being glued together.
		text = text substr(rest, 1, pos - 1) " "
		rest = substr(rest, pos + 2)
		in_comment = 0
	    }
	} else {
	    pos = index(rest, "/*")
	    if (pos == 0) {
		# Only code left on this line.
		rest = ""
	    } else {
		rest = substr(rest, pos + 2)
		in_comment = 1
		has_comment = 1
	    }
	}
    }

    if (has_comment) {
	print text
    }
}
EOF
    119 
# Stabilize sort.
export LC_ALL=C

# Print the word list for the files named in "$@" (standard input when
# there are none), one "<frequency> <word>" line per word, longest first.
# Reads the globals:
#   c          'true' to look at C comments only (uses the program in
#              $awkfile), 'false' to look at the whole input
#   minfreq    lowest accepted frequency, 0 meaning "no lower bound"
#   maxfreq    highest accepted frequency, 0 meaning "no upper bound"
# Stages:
#   1. awk/cat  select the text to examine
#   2. sed      turn every separator (punctuation, blanks, digits) into a
#               newline, leaving one candidate word per line
#   3. tr       lowercase
#   4. sed      drop the empty lines left behind by runs of separators, so
#               that the empty string is not counted as a word
#   5. sort | uniq -c   prefix each distinct word with its frequency
#   6. awk      keep the requested frequency range and prefix the line
#               length as a sort key; the bounds are passed with -v rather
#               than spliced into the program text
#   7. sort | cut       order by that key, longest first, then remove it
word_frequencies ()
{
    if $c; then
	awk \
	    -f "$awkfile" \
	    -- "$@"
    else
	cat "$@"
    fi \
	| sed \
	      -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
	      -e 's/\[/\n/g' \
	      -e 's/\]/\n/g' \
	      -e "s/'/\n/g" \
	      -e 's/[0-9][0-9]*/\n/g' \
	      -e 's/[ \t]*//g' \
	| tr '[:upper:]' '[:lower:]' \
	| sed -e '/^$/d' \
	| sort \
	| uniq -c \
	| awk -v minfreq="$minfreq" -v maxfreq="$maxfreq" '
	    BEGIN {
		# Force numeric bounds; anything non-numeric counts as 0.
		lo = minfreq + 0
		hi = maxfreq + 0
	    }
	    (lo == 0 || lo <= $1) && (hi == 0 || $1 <= hi) {
		print length($0) " " $0
	    }' \
	| sort -n -r \
	| cut -d ' ' -f 2-
}

word_frequencies "$@"
    145