Home | History | Annotate | Line # | Download | only in contrib
check-internal-format-escaping.py revision 1.1.1.2
      1 #!/usr/bin/env python3
      2 #
      3 # Check gcc.pot file for stylistic issues as described in
      4 # https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
      5 # especially in gcc-internal-format messages.
      6 #
      7 # This file is part of GCC.
      8 #
      9 # GCC is free software; you can redistribute it and/or modify it under
     10 # the terms of the GNU General Public License as published by the Free
     11 # Software Foundation; either version 3, or (at your option) any later
     12 # version.
     13 #
     14 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
     15 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
     16 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     17 # for more details.
     18 #
     19 # You should have received a copy of the GNU General Public License
     20 # along with GCC; see the file COPYING3.  If not see
     21 # <http://www.gnu.org/licenses/>.
     22 
     23 import argparse
     24 import re
     25 from collections import Counter
     26 from typing import Dict, Match
     27 
     28 import polib
     29 
# Counts how often each diagnostic text has been reported.  warn()
# increments it and main() prints it as the final summary.
seen_warnings = Counter()
     31 
     32 
def location(msg: polib.POEntry):
    """
    Returns the first occurrence of the message as "file:line", or
    the placeholder '<unknown location>' if the message does not list
    any occurrences.
    """

    if not msg.occurrences:
        return '<unknown location>'

    first = msg.occurrences[0]
    return '{}:{}'.format(first[0], first[1])
     38 
     39 
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints a single diagnostic for the given message, prefixed by the
    location of the message, and counts it in seen_warnings. Unless
    include_msgid is false, the msgid of the message is appended.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    """

    suppressed = f'gcclint:ignore:{diagnostic_id}' in msg.flags
    if suppressed:
        return

    seen_warnings[diagnostic] += 1

    line = f'{location(msg)}: {diagnostic}'
    if include_msgid:
        line += f' in {msg.msgid!r}'
    print(line)
     56 
     57 
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format. These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    Each nested lint_* function implements one check and reports its
    findings through warn(); all of them are run at the end of this
    function.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Returns whether the match starts outside of %<quotes%>, which
        is the case when the text before the match contains as many
        %< as %>.
        """

        before = msgid[:m.start(0)]
        return before.count("%<") == before.count("%>")

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation. This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Compared as sets, so order and repetition do not matter.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of %<quotes%>
        that look like a command line option (a hyphen followed by a
        letter, except for -INF) or that start with __builtin_.
        """

        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        written as %'.
        """

        # The lookbehind keeps the match on the apostrophe itself.
        # Consuming the preceding character instead would miss an
        # apostrophe at the very start of the message, would skip the
        # second of two adjacent apostrophes, and would make
        # outside_quotes look one character too early, misjudging an
        # apostrophe directly after %< or %>.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.

        A %< that directly follows %s is not reported.
        """

        for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)

        At most one warning is issued per message.
        """

        for match in re.finditer("_", msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        checked separately for the msgid and, if the message is
        translated, for the msgstr.
        """

        if msgid.count("%<") != msgid.count("%>"):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, '%s', "%s" and `%s', which can be written in
        the shorter form %qs. Only the first such placeholder of the
        message is reported.
        """
        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()
    206 
    207 
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>. These messages can
    be merged in order to prevent copy-and-paste mistakes by the
    translators.

    Each message is normalized by replacing every %<...%> span with
    %qs. The first message with a given normalized form is remembered;
    every later message with the same normalized form is reported
    together with the location of the remembered one.

    Obsolete entries are ignored, as lint_file does for the
    per-message checks.

    See bug 90119.
    """

    seen: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry

        # Skip obsolete entries so that stale messages neither trigger
        # a same-pattern warning nor become the reference for one.
        if msg.obsolete:
            continue

        msgid = msg.msgid

        normalized = re.sub('%<[^%]+%>', '%qs', msgid)
        if normalized not in seen:
            seen[normalized] = msg
            # Also remember the message under its own text, so that a
            # later message whose normalized form equals this msgid is
            # reported as well.
            seen[msgid] = msg
            continue

        prev = seen[normalized]
        warn(msg,
             'same-pattern',
             f'same pattern for {repr(msgid)} and '
             f'{repr(prev.msgid)} in {location(prev)}',
             include_msgid=False)
    236 
    237 
def lint_file(po: polib.POFile):
    """
    Runs all checks on the given file: first the per-message checks on
    every message that is neither obsolete nor fuzzy and that has the
    gcc-internal-format flag, then the check that compares the
    messages of the file with each other.
    """

    for msg in po:
        msg: polib.POEntry

        if msg.obsolete or msg.fuzzy:
            continue
        if 'gcc-internal-format' not in msg.flags:
            continue

        lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)
    247 
    248 
def main():
    """
    Lints the file named on the command line, then prints a summary
    listing each diagnostic that was reported more than once, together
    with its count.
    """

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('file', help='pot file')
    filename = parser.parse_args().file

    lint_file(polib.pofile(filename))

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        # Diagnostics that occurred only once are left out.
        if count > 1:
            print(f'{count}\t{diagnostic}')


if __name__ == '__main__':
    main()
    267