#!/bin/sh

# Copyright (C) 2019-2024 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# This script intends to facilitate spell checking of source/doc files.
# It:
# - transforms the files into a list of lowercase words
# - prefixes each word with the frequency
# - keeps only the words whose frequency is within a given range
# - sorts the words, longest first
#
# If '-c' is passed as option, it operates on the C comments only, rather than
# on the entire file.
#
# For:
# ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
# $ ./gdb/contrib/words.sh -c $files
# ...
# it generates a list of ~15000 words prefixed with frequency.
#
# This could be used to generate a dictionary that is kept as part of the
# sources, against which new code can be checked, generating a warning or
# error. The hope is that misspellings would trigger this frequently, and rare
# words rarely, otherwise the burden of updating the dictionary would be too
# much.
#
# And for:
# ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
# $ ./gdb/contrib/words.sh -c -f 1 $files
# ...
# it generates a list of ~5000 words with frequency 1.
#
# This can be used to scan for misspellings manually.
#

# Options (they must precede the file operands; option parsing stops at the
# first argument that is not one of them):
#   -c              Only look at the text inside C comments (/* ... */).
#   -f N, --freq N  Only print words that occur exactly N times.
#   --min N         Only print words that occur at least N times.
#   --max N         Only print words that occur at most N times.
# N is a non-negative integer; 0 means "no limit".  Without file operands
# the input is read from stdin.

# Print an error message on stderr and exit with status 2.
die ()
{
    printf '%s: %s\n' "${0##*/}" "$*" >&2
    exit 2
}

# Validate the value of a frequency option.
#   $1 - the option as typed by the user (for the error message)
#   $2 - number of command-line arguments left, the option included
#   $3 - the candidate value (empty when missing)
# Exits through die unless the value is present and consists of digits only.
check_count ()
{
    # Without this check a trailing option would reach "shift 2" with too
    # few arguments; in shells where that merely fails without shifting
    # (e.g. bash outside POSIX mode) the option loop would never terminate.
    [ "$2" -ge 2 ] || die "option '$1' requires an argument"
    case "$3" in
        "" | *[!0-9]*)
            die "option '$1' expects a non-negative integer, got '$3'"
            ;;
    esac
}

# 0 means "unbounded" on either side, so the defaults disable the filter.
minfreq=0
maxfreq=0
comments_only=false
while [ $# -gt 0 ]; do
    case "$1" in
        -c)
            comments_only=true
            shift
            ;;
        --freq | -f)
            check_count "$1" "$#" "${2-}"
            minfreq=$2
            maxfreq=$2
            shift 2
            ;;
        --min)
            check_count "$1" "$#" "${2-}"
            minfreq=$2
            shift 2
            ;;
        --max)
            check_count "$1" "$#" "${2-}"
            maxfreq=$2
            shift 2
            ;;
        *)
            break
            ;;
    esac
done

# Awk program that prints only the text found inside /* ... */ comments.
# Each input line is scanned left to right, so several comments on one line,
# and a line that closes one comment and opens the next, are all handled.
# The program is passed inline, so no temporary file is needed.
comment_awk='
# An unterminated comment must not leak into the next input file.
FNR == 1 {
    in_comment = 0
}

{
    rest = $0
    text = ""
    while (length(rest) > 0) {
        if (in_comment) {
            pos = index(rest, "*/")
            if (pos == 0) {
                # The comment continues on the next line.
                text = text rest
                rest = ""
            } else {
                # Keep a separator so that words of adjacent comments
                # are not glued together.
                text = text substr(rest, 1, pos - 1) " "
                rest = substr(rest, pos + 2)
                in_comment = 0
            }
        } else {
            pos = index(rest, "/*")
            if (pos == 0) {
                rest = ""
            } else {
                rest = substr(rest, pos + 2)
                in_comment = 1
            }
        }
    }
    if (length(text) > 0) {
        print text
    }
}
'

# Stabilize sort.
export LC_ALL=C

# Pipeline stages:
#  1. select the input text (comments only, or everything),
#  2. turn every separator character and every run of digits into a newline
#     and drop the remaining blanks, leaving one word per line,
#  3. lowercase, sort and count the words,
#  4. drop the empty "word" that the separators leave behind (NF < 2), apply
#     the frequency range and prefix each line with its length,
#  5. sort on that length, longest first, and strip the length again.
# The limits reach awk through -v instead of being pasted into the program
# text, so they cannot alter the program.
if $comments_only; then
    awk -- "$comment_awk" "$@"
else
    cat "$@"
fi \
    | sed \
        -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
        -e 's/\[/\n/g' \
        -e 's/\]/\n/g' \
        -e "s/'/\n/g" \
        -e 's/[0-9][0-9]*/\n/g' \
        -e 's/[ \t]*//g' \
    | tr '[:upper:]' '[:lower:]' \
    | sort \
    | uniq -c \
    | awk -v lo="$minfreq" -v hi="$maxfreq" '
        NF >= 2 && (lo == 0 || lo <= $1) && (hi == 0 || $1 <= hi) {
            print length($0) " " $0
        }' \
    | sort -n -r \
    | cut -d ' ' -f 2-