rust-demangle.c revision 1.1 1 1.1 christos /* Demangler for the Rust programming language
2 1.1 christos Copyright (C) 2016-2017 Free Software Foundation, Inc.
3 1.1 christos Written by David Tolnay (dtolnay (at) gmail.com).
4 1.1 christos
5 1.1 christos This file is part of the libiberty library.
6 1.1 christos Libiberty is free software; you can redistribute it and/or
7 1.1 christos modify it under the terms of the GNU Library General Public
8 1.1 christos License as published by the Free Software Foundation; either
9 1.1 christos version 2 of the License, or (at your option) any later version.
10 1.1 christos
11 1.1 christos In addition to the permissions in the GNU Library General Public
12 1.1 christos License, the Free Software Foundation gives you unlimited permission
13 1.1 christos to link the compiled version of this file into combinations with other
14 1.1 christos programs, and to distribute those combinations without any restriction
15 1.1 christos coming from the use of this file. (The Library Public License
16 1.1 christos restrictions do apply in other respects; for example, they cover
17 1.1 christos modification of the file, and distribution when not linked into a
18 1.1 christos combined executable.)
19 1.1 christos
20 1.1 christos Libiberty is distributed in the hope that it will be useful,
21 1.1 christos but WITHOUT ANY WARRANTY; without even the implied warranty of
22 1.1 christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 1.1 christos Library General Public License for more details.
24 1.1 christos
25 1.1 christos You should have received a copy of the GNU Library General Public
26 1.1 christos License along with libiberty; see the file COPYING.LIB.
27 1.1 christos If not, see <http://www.gnu.org/licenses/>. */
28 1.1 christos
29 1.1 christos
30 1.1 christos #ifdef HAVE_CONFIG_H
31 1.1 christos #include "config.h"
32 1.1 christos #endif
33 1.1 christos
34 1.1 christos #include "safe-ctype.h"
35 1.1 christos
36 1.1 christos #include <sys/types.h>
37 1.1 christos #include <string.h>
38 1.1 christos #include <stdio.h>
39 1.1 christos
40 1.1 christos #ifdef HAVE_STRING_H
41 1.1 christos #include <string.h>
42 1.1 christos #else
43 1.1 christos extern size_t strlen(const char *s);
44 1.1 christos extern int strncmp(const char *s1, const char *s2, size_t n);
45 1.1 christos extern void *memset(void *s, int c, size_t n);
46 1.1 christos #endif
47 1.1 christos
48 1.1 christos #include <demangle.h>
49 1.1 christos #include "libiberty.h"
50 1.1 christos
51 1.1 christos
/* Mangled Rust symbols look like this:
     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a

   The original symbol is:
     <std::sys::fd::FileDesc as core::ops::Drop>::drop

   The last component of the path is a 64-bit hash in lowercase hex,
   prefixed with "h".  Rust does not have a global namespace between
   crates, an illusion which Rust maintains by using the hash to
   distinguish things that would otherwise have the same symbol.

   Any path component not starting with an XID_Start character is
   prefixed with "_".

   The following escape sequences are used:

   ","  =>  $C$
   "@"  =>  $SP$
   "*"  =>  $BP$
   "&"  =>  $RF$
   "<"  =>  $LT$
   ">"  =>  $GT$
   "("  =>  $LP$
   ")"  =>  $RP$
   " "  =>  $u20$
   "\"" =>  $u22$
   "'"  =>  $u27$
   "+"  =>  $u2b$
   ";"  =>  $u3b$
   "["  =>  $u5b$
   "]"  =>  $u5d$
   "{"  =>  $u7b$
   "}"  =>  $u7d$
   "~"  =>  $u7e$

   A double ".." means "::" and a single "." means "-".

   The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$  */
90 1.1 christos
/* Every mangled symbol handled here ends with HASH_PREFIX followed by
   HASH_LEN hex digits.  HASH_PREFIX_LEN is the length of HASH_PREFIX,
   kept as a constant so it does not have to be recomputed.  */
static const char *hash_prefix = "::h";
static const size_t hash_prefix_len = 3;
static const size_t hash_len = 16;

/* Helpers defined further down in this file.  */
static int is_prefixed_hash (const char *str);
static int looks_like_rust (const char *str, size_t len);
static int unescape (const char **in, char **out, const char *seq, char value);
98 1.1 christos
99 1.1 christos /* INPUT: sym: symbol that has been through C++ (gnu v3) demangling
100 1.1 christos
101 1.1 christos This function looks for the following indicators:
102 1.1 christos
103 1.1 christos 1. The hash must consist of "h" followed by 16 lowercase hex digits.
104 1.1 christos
105 1.1 christos 2. As a sanity check, the hash must use between 5 and 15 of the 16
106 1.1 christos possible hex digits. This is true of 99.9998% of hashes so once
107 1.1 christos in your life you may see a false negative. The point is to
108 1.1 christos notice path components that could be Rust hashes but are
109 1.1 christos probably not, like "haaaaaaaaaaaaaaaa". In this case a false
110 1.1 christos positive (non-Rust symbol has an important path component
111 1.1 christos removed because it looks like a Rust hash) is worse than a false
112 1.1 christos negative (the rare Rust symbol is not demangled) so this sets
113 1.1 christos the balance in favor of false negatives.
114 1.1 christos
115 1.1 christos 3. There must be no characters other than a-zA-Z0-9 and _.:$
116 1.1 christos
117 1.1 christos 4. There must be no unrecognized $-sign sequences.
118 1.1 christos
119 1.1 christos 5. There must be no sequence of three or more dots in a row ("..."). */
120 1.1 christos
121 1.1 christos int
122 1.1 christos rust_is_mangled (const char *sym)
123 1.1 christos {
124 1.1 christos size_t len, len_without_hash;
125 1.1 christos
126 1.1 christos if (!sym)
127 1.1 christos return 0;
128 1.1 christos
129 1.1 christos len = strlen (sym);
130 1.1 christos if (len <= hash_prefix_len + hash_len)
131 1.1 christos /* Not long enough to contain "::h" + hash + something else */
132 1.1 christos return 0;
133 1.1 christos
134 1.1 christos len_without_hash = len - (hash_prefix_len + hash_len);
135 1.1 christos if (!is_prefixed_hash (sym + len_without_hash))
136 1.1 christos return 0;
137 1.1 christos
138 1.1 christos return looks_like_rust (sym, len_without_hash);
139 1.1 christos }
140 1.1 christos
141 1.1 christos /* A hash is the prefix "::h" followed by 16 lowercase hex digits. The
142 1.1 christos hex digits must comprise between 5 and 15 (inclusive) distinct
143 1.1 christos digits. */
144 1.1 christos
145 1.1 christos static int
146 1.1 christos is_prefixed_hash (const char *str)
147 1.1 christos {
148 1.1 christos const char *end;
149 1.1 christos char seen[16];
150 1.1 christos size_t i;
151 1.1 christos int count;
152 1.1 christos
153 1.1 christos if (strncmp (str, hash_prefix, hash_prefix_len))
154 1.1 christos return 0;
155 1.1 christos str += hash_prefix_len;
156 1.1 christos
157 1.1 christos memset (seen, 0, sizeof(seen));
158 1.1 christos for (end = str + hash_len; str < end; str++)
159 1.1 christos if (*str >= '0' && *str <= '9')
160 1.1 christos seen[*str - '0'] = 1;
161 1.1 christos else if (*str >= 'a' && *str <= 'f')
162 1.1 christos seen[*str - 'a' + 10] = 1;
163 1.1 christos else
164 1.1 christos return 0;
165 1.1 christos
166 1.1 christos /* Count how many distinct digits seen */
167 1.1 christos count = 0;
168 1.1 christos for (i = 0; i < 16; i++)
169 1.1 christos if (seen[i])
170 1.1 christos count++;
171 1.1 christos
172 1.1 christos return count >= 5 && count <= 15;
173 1.1 christos }
174 1.1 christos
/* Return 1 if the first LEN characters of STR contain only what a
   mangled Rust path may contain, and 0 otherwise.  Allowed are the
   characters a-zA-Z0-9 and _.: plus the recognized $-escape sequences;
   a run of three or more dots is rejected.  */

static int
looks_like_rust (const char *str, size_t len)
{
  static const char *const escapes[] = {
    "$C$",
    "$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
    "$u20$", "$u22$", "$u27$", "$u2b$", "$u3b$",
    "$u5b$", "$u5d$", "$u7b$", "$u7d$", "$u7e$"
  };
  static const char plain[] =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789_:";
  const size_t n_escapes = sizeof (escapes) / sizeof (escapes[0]);
  const char *cur = str;
  const char *const limit = str + len;

  while (cur < limit)
    {
      if (*cur == '$')
        {
          /* Length of the escape sequence found at CUR, 0 if none.  */
          size_t matched = 0;
          size_t k;

          for (k = 0; k < n_escapes && matched == 0; k++)
            {
              const size_t esc_len = strlen (escapes[k]);

              if (strncmp (cur, escapes[k], esc_len) == 0)
                matched = esc_len;
            }

          if (matched == 0)
            return 0;
          cur += matched;
        }
      else if (*cur == '.')
        {
          /* Do not allow three or more consecutive dots.  */
          if (strncmp (cur, "...", 3) == 0)
            return 0;
          cur++;
        }
      else if (*cur != '\0' && strchr (plain, *cur) != NULL)
        /* The NUL test is needed because strchr also matches the
           terminator of PLAIN.  */
        cur++;
      else
        return 0;
    }

  return 1;
}
235 1.1 christos
236 1.1 christos /*
237 1.1 christos INPUT: sym: symbol for which rust_is_mangled(sym) returned 1.
238 1.1 christos
239 1.1 christos The input is demangled in-place because the mangled name is always
240 1.1 christos longer than the demangled one. */
241 1.1 christos
242 1.1 christos void
243 1.1 christos rust_demangle_sym (char *sym)
244 1.1 christos {
245 1.1 christos const char *in;
246 1.1 christos char *out;
247 1.1 christos const char *end;
248 1.1 christos
249 1.1 christos if (!sym)
250 1.1 christos return;
251 1.1 christos
252 1.1 christos in = sym;
253 1.1 christos out = sym;
254 1.1 christos end = sym + strlen (sym) - (hash_prefix_len + hash_len);
255 1.1 christos
256 1.1 christos while (in < end)
257 1.1 christos switch (*in)
258 1.1 christos {
259 1.1 christos case '$':
260 1.1 christos if (!(unescape (&in, &out, "$C$", ',')
261 1.1 christos || unescape (&in, &out, "$SP$", '@')
262 1.1 christos || unescape (&in, &out, "$BP$", '*')
263 1.1 christos || unescape (&in, &out, "$RF$", '&')
264 1.1 christos || unescape (&in, &out, "$LT$", '<')
265 1.1 christos || unescape (&in, &out, "$GT$", '>')
266 1.1 christos || unescape (&in, &out, "$LP$", '(')
267 1.1 christos || unescape (&in, &out, "$RP$", ')')
268 1.1 christos || unescape (&in, &out, "$u20$", ' ')
269 1.1 christos || unescape (&in, &out, "$u22$", '\"')
270 1.1 christos || unescape (&in, &out, "$u27$", '\'')
271 1.1 christos || unescape (&in, &out, "$u2b$", '+')
272 1.1 christos || unescape (&in, &out, "$u3b$", ';')
273 1.1 christos || unescape (&in, &out, "$u5b$", '[')
274 1.1 christos || unescape (&in, &out, "$u5d$", ']')
275 1.1 christos || unescape (&in, &out, "$u7b$", '{')
276 1.1 christos || unescape (&in, &out, "$u7d$", '}')
277 1.1 christos || unescape (&in, &out, "$u7e$", '~'))) {
278 1.1 christos /* unexpected escape sequence, not looks_like_rust. */
279 1.1 christos goto fail;
280 1.1 christos }
281 1.1 christos break;
282 1.1 christos case '_':
283 1.1 christos /* If this is the start of a path component and the next
284 1.1 christos character is an escape sequence, ignore the underscore. The
285 1.1 christos mangler inserts an underscore to make sure the path
286 1.1 christos component begins with a XID_Start character. */
287 1.1 christos if ((in == sym || in[-1] == ':') && in[1] == '$')
288 1.1 christos in++;
289 1.1 christos else
290 1.1 christos *out++ = *in++;
291 1.1 christos break;
292 1.1 christos case '.':
293 1.1 christos if (in[1] == '.')
294 1.1 christos {
295 1.1 christos /* ".." becomes "::" */
296 1.1 christos *out++ = ':';
297 1.1 christos *out++ = ':';
298 1.1 christos in += 2;
299 1.1 christos }
300 1.1 christos else
301 1.1 christos {
302 1.1 christos /* "." becomes "-" */
303 1.1 christos *out++ = '-';
304 1.1 christos in++;
305 1.1 christos }
306 1.1 christos break;
307 1.1 christos case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
308 1.1 christos case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
309 1.1 christos case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
310 1.1 christos case 's': case 't': case 'u': case 'v': case 'w': case 'x':
311 1.1 christos case 'y': case 'z':
312 1.1 christos case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
313 1.1 christos case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
314 1.1 christos case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
315 1.1 christos case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
316 1.1 christos case 'Y': case 'Z':
317 1.1 christos case '0': case '1': case '2': case '3': case '4': case '5':
318 1.1 christos case '6': case '7': case '8': case '9':
319 1.1 christos case ':':
320 1.1 christos *out++ = *in++;
321 1.1 christos break;
322 1.1 christos default:
323 1.1 christos /* unexpected character in symbol, not looks_like_rust. */
324 1.1 christos goto fail;
325 1.1 christos }
326 1.1 christos goto done;
327 1.1 christos
328 1.1 christos fail:
329 1.1 christos *out++ = '?'; /* This is pretty lame, but it's hard to do better. */
330 1.1 christos done:
331 1.1 christos *out = '\0';
332 1.1 christos }
333 1.1 christos
/* Try to decode the escape sequence SEQ at the read position *IN.

   If the text at *IN starts with SEQ, store VALUE at the write position
   *OUT, advance *IN past the sequence and *OUT by one character, and
   return 1.  Otherwise return 0 and leave both positions unchanged.  */

static int
unescape (const char **in, char **out, const char *seq, char value)
{
  const size_t seq_len = strlen (seq);
  const char *src = *in;
  char *dst = *out;

  if (strncmp (src, seq, seq_len) != 0)
    return 0;

  *dst = value;
  *out = dst + 1;
  *in = src + seq_len;

  return 1;
}
349