rust-demangle.c revision 1.1.1.2 1 1.1 mrg /* Demangler for the Rust programming language
2 1.1.1.2 mrg Copyright (C) 2016-2018 Free Software Foundation, Inc.
3 1.1 mrg Written by David Tolnay (dtolnay (at) gmail.com).
4 1.1 mrg
5 1.1 mrg This file is part of the libiberty library.
6 1.1 mrg Libiberty is free software; you can redistribute it and/or
7 1.1 mrg modify it under the terms of the GNU Library General Public
8 1.1 mrg License as published by the Free Software Foundation; either
9 1.1 mrg version 2 of the License, or (at your option) any later version.
10 1.1 mrg
11 1.1 mrg In addition to the permissions in the GNU Library General Public
12 1.1 mrg License, the Free Software Foundation gives you unlimited permission
13 1.1 mrg to link the compiled version of this file into combinations with other
14 1.1 mrg programs, and to distribute those combinations without any restriction
15 1.1 mrg coming from the use of this file. (The Library Public License
16 1.1 mrg restrictions do apply in other respects; for example, they cover
17 1.1 mrg modification of the file, and distribution when not linked into a
18 1.1 mrg combined executable.)
19 1.1 mrg
20 1.1 mrg Libiberty is distributed in the hope that it will be useful,
21 1.1 mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
22 1.1 mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 1.1 mrg Library General Public License for more details.
24 1.1 mrg
25 1.1 mrg You should have received a copy of the GNU Library General Public
26 1.1 mrg License along with libiberty; see the file COPYING.LIB.
27 1.1 mrg If not, see <http://www.gnu.org/licenses/>. */
28 1.1 mrg
29 1.1 mrg
30 1.1 mrg #ifdef HAVE_CONFIG_H
31 1.1 mrg #include "config.h"
32 1.1 mrg #endif
33 1.1 mrg
34 1.1 mrg #include "safe-ctype.h"
35 1.1 mrg
36 1.1 mrg #include <sys/types.h>
37 1.1 mrg #include <string.h>
38 1.1 mrg #include <stdio.h>
39 1.1 mrg
40 1.1 mrg #ifdef HAVE_STRING_H
41 1.1 mrg #include <string.h>
42 1.1 mrg #else
43 1.1 mrg extern size_t strlen(const char *s);
44 1.1 mrg extern int strncmp(const char *s1, const char *s2, size_t n);
45 1.1 mrg extern void *memset(void *s, int c, size_t n);
46 1.1 mrg #endif
47 1.1 mrg
48 1.1 mrg #include <demangle.h>
49 1.1 mrg #include "libiberty.h"
50 1.1 mrg
51 1.1 mrg
/* Mangled Rust symbols look like this:
     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a

   The original symbol is:
     <std::sys::fd::FileDesc as core::ops::Drop>::drop

   The last component of the path is a 64-bit hash in lowercase hex,
   prefixed with "h".  Rust has no global namespace shared between
   crates; it maintains that illusion by using the hash to distinguish
   items that would otherwise have the same symbol.

   Any path component not starting with an XID_Start character is
   prefixed with "_".

   The following escape sequences are used:

   ","  =>  $C$
   "@"  =>  $SP$
   "*"  =>  $BP$
   "&"  =>  $RF$
   "<"  =>  $LT$
   ">"  =>  $GT$
   "("  =>  $LP$
   ")"  =>  $RP$
   " "  =>  $u20$
   "\"" =>  $u22$
   "'"  =>  $u27$
   "+"  =>  $u2b$
   ";"  =>  $u3b$
   "["  =>  $u5b$
   "]"  =>  $u5d$
   "{"  =>  $u7b$
   "}"  =>  $u7d$
   "~"  =>  $u7e$

   A double ".." means "::" and a single "." means "-".

   The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$  */
90 1.1 mrg
/* A mangled Rust symbol ends in HASH_PREFIX followed by HASH_LEN hex
   digits.  HASH_PREFIX_LEN is the length of HASH_PREFIX, not counting
   the terminating NUL.  */
static const char *const hash_prefix = "::h";
static const size_t hash_prefix_len = 3;
static const size_t hash_len = 16;

/* Helpers defined later in this file.  */
static int is_prefixed_hash (const char *str);
static int looks_like_rust (const char *str, size_t len);
static int unescape (const char **in, char **out, const char *seq, char value);
98 1.1 mrg
99 1.1 mrg /* INPUT: sym: symbol that has been through C++ (gnu v3) demangling
100 1.1 mrg
101 1.1 mrg This function looks for the following indicators:
102 1.1 mrg
103 1.1 mrg 1. The hash must consist of "h" followed by 16 lowercase hex digits.
104 1.1 mrg
105 1.1 mrg 2. As a sanity check, the hash must use between 5 and 15 of the 16
106 1.1 mrg possible hex digits. This is true of 99.9998% of hashes so once
107 1.1 mrg in your life you may see a false negative. The point is to
108 1.1 mrg notice path components that could be Rust hashes but are
109 1.1 mrg probably not, like "haaaaaaaaaaaaaaaa". In this case a false
110 1.1 mrg positive (non-Rust symbol has an important path component
111 1.1 mrg removed because it looks like a Rust hash) is worse than a false
112 1.1 mrg negative (the rare Rust symbol is not demangled) so this sets
113 1.1 mrg the balance in favor of false negatives.
114 1.1 mrg
115 1.1 mrg 3. There must be no characters other than a-zA-Z0-9 and _.:$
116 1.1 mrg
117 1.1 mrg 4. There must be no unrecognized $-sign sequences.
118 1.1 mrg
119 1.1 mrg 5. There must be no sequence of three or more dots in a row ("..."). */
120 1.1 mrg
121 1.1 mrg int
122 1.1 mrg rust_is_mangled (const char *sym)
123 1.1 mrg {
124 1.1 mrg size_t len, len_without_hash;
125 1.1 mrg
126 1.1 mrg if (!sym)
127 1.1 mrg return 0;
128 1.1 mrg
129 1.1 mrg len = strlen (sym);
130 1.1 mrg if (len <= hash_prefix_len + hash_len)
131 1.1 mrg /* Not long enough to contain "::h" + hash + something else */
132 1.1 mrg return 0;
133 1.1 mrg
134 1.1 mrg len_without_hash = len - (hash_prefix_len + hash_len);
135 1.1 mrg if (!is_prefixed_hash (sym + len_without_hash))
136 1.1 mrg return 0;
137 1.1 mrg
138 1.1 mrg return looks_like_rust (sym, len_without_hash);
139 1.1 mrg }
140 1.1 mrg
141 1.1 mrg /* A hash is the prefix "::h" followed by 16 lowercase hex digits. The
142 1.1 mrg hex digits must comprise between 5 and 15 (inclusive) distinct
143 1.1 mrg digits. */
144 1.1 mrg
145 1.1 mrg static int
146 1.1 mrg is_prefixed_hash (const char *str)
147 1.1 mrg {
148 1.1 mrg const char *end;
149 1.1 mrg char seen[16];
150 1.1 mrg size_t i;
151 1.1 mrg int count;
152 1.1 mrg
153 1.1 mrg if (strncmp (str, hash_prefix, hash_prefix_len))
154 1.1 mrg return 0;
155 1.1 mrg str += hash_prefix_len;
156 1.1 mrg
157 1.1 mrg memset (seen, 0, sizeof(seen));
158 1.1 mrg for (end = str + hash_len; str < end; str++)
159 1.1 mrg if (*str >= '0' && *str <= '9')
160 1.1 mrg seen[*str - '0'] = 1;
161 1.1 mrg else if (*str >= 'a' && *str <= 'f')
162 1.1 mrg seen[*str - 'a' + 10] = 1;
163 1.1 mrg else
164 1.1 mrg return 0;
165 1.1 mrg
166 1.1 mrg /* Count how many distinct digits seen */
167 1.1 mrg count = 0;
168 1.1 mrg for (i = 0; i < 16; i++)
169 1.1 mrg if (seen[i])
170 1.1 mrg count++;
171 1.1 mrg
172 1.1 mrg return count >= 5 && count <= 15;
173 1.1 mrg }
174 1.1 mrg
/* Return nonzero if the first LEN bytes of STR could be the path part
   of a mangled Rust symbol, zero otherwise.

   The bytes must consist only of a-zA-Z0-9, '_', ':', '.', and the
   recognized $-escape sequences, with no run of three or more dots.
   An empty range (LEN == 0) is accepted.

   Escape sequences and dot runs are matched only within the LEN bytes
   being checked; an escape that would extend past STR + LEN is
   rejected rather than matched against the bytes that follow.  */

static int
looks_like_rust (const char *str, size_t len)
{
  static const char *const escapes[] = {
    "$C$",
    "$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
    "$u20$", "$u22$", "$u27$", "$u2b$", "$u3b$",
    "$u5b$", "$u5d$", "$u7b$", "$u7d$", "$u7e$"
  };
  const size_t n_escapes = sizeof (escapes) / sizeof (escapes[0]);
  const char *end = str + len;

  while (str < end)
    {
      /* Number of bytes still inside the range being checked.  */
      size_t left = (size_t) (end - str);
      size_t esc_len = 0;
      size_t i;

      switch (*str)
        {
        case '$':
          for (i = 0; i < n_escapes; i++)
            {
              esc_len = strlen (escapes[i]);
              if (esc_len <= left && !strncmp (str, escapes[i], esc_len))
                break;
            }
          if (i == n_escapes)
            /* Unrecognized or truncated $-sequence.  */
            return 0;
          str += esc_len;
          break;
        case '.':
          /* Do not allow three or more consecutive dots.  */
          if (left >= 3 && !strncmp (str, "...", 3))
            return 0;
          /* Fall through */
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '0': case '1': case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case '_':
        case ':':
          str++;
          break;
        default:
          return 0;
        }
    }

  return 1;
}
235 1.1 mrg
236 1.1 mrg /*
237 1.1 mrg INPUT: sym: symbol for which rust_is_mangled(sym) returned 1.
238 1.1 mrg
239 1.1 mrg The input is demangled in-place because the mangled name is always
240 1.1 mrg longer than the demangled one. */
241 1.1 mrg
242 1.1 mrg void
243 1.1 mrg rust_demangle_sym (char *sym)
244 1.1 mrg {
245 1.1 mrg const char *in;
246 1.1 mrg char *out;
247 1.1 mrg const char *end;
248 1.1 mrg
249 1.1 mrg if (!sym)
250 1.1 mrg return;
251 1.1 mrg
252 1.1 mrg in = sym;
253 1.1 mrg out = sym;
254 1.1 mrg end = sym + strlen (sym) - (hash_prefix_len + hash_len);
255 1.1 mrg
256 1.1 mrg while (in < end)
257 1.1 mrg switch (*in)
258 1.1 mrg {
259 1.1 mrg case '$':
260 1.1 mrg if (!(unescape (&in, &out, "$C$", ',')
261 1.1 mrg || unescape (&in, &out, "$SP$", '@')
262 1.1 mrg || unescape (&in, &out, "$BP$", '*')
263 1.1 mrg || unescape (&in, &out, "$RF$", '&')
264 1.1 mrg || unescape (&in, &out, "$LT$", '<')
265 1.1 mrg || unescape (&in, &out, "$GT$", '>')
266 1.1 mrg || unescape (&in, &out, "$LP$", '(')
267 1.1 mrg || unescape (&in, &out, "$RP$", ')')
268 1.1 mrg || unescape (&in, &out, "$u20$", ' ')
269 1.1 mrg || unescape (&in, &out, "$u22$", '\"')
270 1.1 mrg || unescape (&in, &out, "$u27$", '\'')
271 1.1 mrg || unescape (&in, &out, "$u2b$", '+')
272 1.1 mrg || unescape (&in, &out, "$u3b$", ';')
273 1.1 mrg || unescape (&in, &out, "$u5b$", '[')
274 1.1 mrg || unescape (&in, &out, "$u5d$", ']')
275 1.1 mrg || unescape (&in, &out, "$u7b$", '{')
276 1.1 mrg || unescape (&in, &out, "$u7d$", '}')
277 1.1 mrg || unescape (&in, &out, "$u7e$", '~'))) {
278 1.1 mrg /* unexpected escape sequence, not looks_like_rust. */
279 1.1 mrg goto fail;
280 1.1 mrg }
281 1.1 mrg break;
282 1.1 mrg case '_':
283 1.1 mrg /* If this is the start of a path component and the next
284 1.1 mrg character is an escape sequence, ignore the underscore. The
285 1.1 mrg mangler inserts an underscore to make sure the path
286 1.1 mrg component begins with a XID_Start character. */
287 1.1 mrg if ((in == sym || in[-1] == ':') && in[1] == '$')
288 1.1 mrg in++;
289 1.1 mrg else
290 1.1 mrg *out++ = *in++;
291 1.1 mrg break;
292 1.1 mrg case '.':
293 1.1 mrg if (in[1] == '.')
294 1.1 mrg {
295 1.1 mrg /* ".." becomes "::" */
296 1.1 mrg *out++ = ':';
297 1.1 mrg *out++ = ':';
298 1.1 mrg in += 2;
299 1.1 mrg }
300 1.1 mrg else
301 1.1 mrg {
302 1.1 mrg /* "." becomes "-" */
303 1.1 mrg *out++ = '-';
304 1.1 mrg in++;
305 1.1 mrg }
306 1.1 mrg break;
307 1.1 mrg case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
308 1.1 mrg case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
309 1.1 mrg case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
310 1.1 mrg case 's': case 't': case 'u': case 'v': case 'w': case 'x':
311 1.1 mrg case 'y': case 'z':
312 1.1 mrg case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
313 1.1 mrg case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
314 1.1 mrg case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
315 1.1 mrg case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
316 1.1 mrg case 'Y': case 'Z':
317 1.1 mrg case '0': case '1': case '2': case '3': case '4': case '5':
318 1.1 mrg case '6': case '7': case '8': case '9':
319 1.1 mrg case ':':
320 1.1 mrg *out++ = *in++;
321 1.1 mrg break;
322 1.1 mrg default:
323 1.1 mrg /* unexpected character in symbol, not looks_like_rust. */
324 1.1 mrg goto fail;
325 1.1 mrg }
326 1.1 mrg goto done;
327 1.1 mrg
328 1.1 mrg fail:
329 1.1 mrg *out++ = '?'; /* This is pretty lame, but it's hard to do better. */
330 1.1 mrg done:
331 1.1 mrg *out = '\0';
332 1.1 mrg }
333 1.1 mrg
/* If the text at *IN starts with the escape sequence SEQ, store VALUE
   at *OUT, advance *IN past the sequence and *OUT by one character,
   and return 1.  Otherwise return 0 and leave *IN, *OUT and the output
   buffer unchanged.  */

static int
unescape (const char **in, char **out, const char *seq, char value)
{
  size_t len = strlen (seq);

  if (strncmp (*in, seq, len))
    return 0;

  **out = value;

  *in += len;
  *out += 1;

  return 1;
}
349