#!/usr/bin/env python3
#
# Check gcc.pot file for stylistic issues as described in
# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
# especially in gcc-internal-format messages.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.
22 1.1 mrg
23 1.1 mrg import argparse
24 1.1 mrg import re
25 1.1.1.2 mrg from collections import Counter
26 1.1.1.2 mrg from typing import Dict, Match
27 1.1.1.2 mrg
28 1.1.1.2 mrg import polib
29 1.1.1.2 mrg
# Counts how often each diagnostic text has been reported by warn();
# main() prints these counts as the final summary.
seen_warnings = Counter()
31 1.1.1.2 mrg
32 1.1.1.2 mrg
def location(msg: polib.POEntry) -> str:
    """
    Returns the first source reference of the message, formatted as
    'file:line', for use as the prefix of a diagnostic.

    A reference that carries no line number is returned as the bare
    file name.  A message without any reference yields
    '<unknown location>'.
    """

    if not msg.occurrences:
        return '<unknown location>'

    first = msg.occurrences[0]
    filename, lineno = first[0], first[1]

    # A reference without a line number is stored with an empty line
    # string; avoid printing a dangling ':' for it.
    if lineno in ('', None):
        return f'{filename}'
    return f'{filename}:{lineno}'
38 1.1.1.2 mrg
39 1.1.1.2 mrg
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints one diagnostic for the given message, prefixed with the
    message's location, and counts it in seen_warnings.

    When include_msgid is true, the repr of the msgid is appended to
    the printed line.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    """

    suppression_flag = f'gcclint:ignore:{diagnostic_id}'
    if suppression_flag in msg.flags:
        return

    seen_warnings[diagnostic] += 1

    line = f'{location(msg)}: {diagnostic}'
    if include_msgid:
        line += f' in {repr(msg.msgid)}'
    print(line)
56 1.1.1.2 mrg
57 1.1.1.2 mrg
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format.  These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    Each finding is reported through warn(); nothing is returned.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of %<quotes%>, judged by
        comparing how many %< and %> precede the start of the match.
        """
        before = msgid[:m.start(0)]
        return before.count('%<') == before.count('%>')

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation.  This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Only the set of quoted literals is compared, so their order
        # and multiplicity may differ between msgid and msgstr.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about words outside of %<quotes%> that look like a
        command line option or like a __builtin_ function.
        """
        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    # -INF is a value, not an option.
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        preceded by a percent sign.
        """
        # The lookbehind keeps the match on the apostrophe itself.  That
        # way an apostrophe at the very start of the message is found,
        # and a %< or %> directly in front of the apostrophe is part of
        # the prefix that outside_quotes inspects.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.
        """

        for match in re.finditer('(.?[a-zA-Z0-9])%<', msgid):
            # A preceding %s placeholder is not a word, so it is fine.
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)
        """

        for match in re.finditer('_', msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                # One report per message is enough.
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not".  These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>, in
        the msgid and, for translated messages, in the msgstr.
        """
        if msgid.count('%<') != msgid.count('%>'):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count('%<') != msg.msgstr.count('%>'):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        The same applies to %s in plain ASCII quotes.
        """
        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()
206 1.1.1.2 mrg
207 1.1.1.2 mrg
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>.  These messages can
    be merged in order to prevent copy-and-paste mistakes by the
    translators.

    Each such message is reported together with the earlier message
    that has the same pattern.

    See bug 90119.
    """

    # Maps both the normalized pattern and the original msgid of a
    # message to the first message that introduced that pattern.
    first_with_pattern: Dict[str, polib.POEntry] = {}

    for entry in po:
        text = entry.msgid

        # Replace every quoted literal by %qs to obtain the pattern.
        pattern = re.sub('%<[^%]+%>', '%qs', text)

        earlier = first_with_pattern.get(pattern)
        if earlier is None:
            first_with_pattern[pattern] = entry
            first_with_pattern[text] = entry
            continue

        warn(entry,
             'same-pattern',
             f'same pattern for {repr(text)} and '
             f'{repr(earlier.msgid)} in {location(earlier)}',
             include_msgid=False)
236 1.1.1.2 mrg
237 1.1.1.2 mrg
def lint_file(po: polib.POFile):
    """
    Runs the per-message checks on every message that is neither
    obsolete nor fuzzy and that carries the gcc-internal-format flag,
    then runs the check that compares the messages of the whole file.
    """

    for entry in po:
        if entry.obsolete or entry.fuzzy:
            continue
        if 'gcc-internal-format' not in entry.flags:
            continue
        lint_gcc_internal_format(entry)

    lint_diagnostics_differing_only_in_placeholders(po)
247 1.1.1.2 mrg
248 1.1.1.2 mrg
def main():
    """
    Lints the file named on the command line and afterwards prints a
    summary of every diagnostic that was reported more than once.
    """

    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('file', help='pot file')
    options = arg_parser.parse_args()

    lint_file(polib.pofile(options.file))

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        if count > 1:
            print(f'{count}\t{diagnostic}')


if __name__ == '__main__':
    main()
267