Home | History | Annotate | Line # | Download | only in libiberty
rust-demangle.c revision 1.1
      1  1.1  christos /* Demangler for the Rust programming language
      2  1.1  christos    Copyright (C) 2016-2017 Free Software Foundation, Inc.
      3  1.1  christos    Written by David Tolnay (dtolnay (at) gmail.com).
      4  1.1  christos 
      5  1.1  christos This file is part of the libiberty library.
      6  1.1  christos Libiberty is free software; you can redistribute it and/or
      7  1.1  christos modify it under the terms of the GNU Library General Public
      8  1.1  christos License as published by the Free Software Foundation; either
      9  1.1  christos version 2 of the License, or (at your option) any later version.
     10  1.1  christos 
     11  1.1  christos In addition to the permissions in the GNU Library General Public
     12  1.1  christos License, the Free Software Foundation gives you unlimited permission
     13  1.1  christos to link the compiled version of this file into combinations with other
     14  1.1  christos programs, and to distribute those combinations without any restriction
     15  1.1  christos coming from the use of this file.  (The Library Public License
     16  1.1  christos restrictions do apply in other respects; for example, they cover
     17  1.1  christos modification of the file, and distribution when not linked into a
     18  1.1  christos combined executable.)
     19  1.1  christos 
     20  1.1  christos Libiberty is distributed in the hope that it will be useful,
     21  1.1  christos but WITHOUT ANY WARRANTY; without even the implied warranty of
     22  1.1  christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23  1.1  christos Library General Public License for more details.
     24  1.1  christos 
     25  1.1  christos You should have received a copy of the GNU Library General Public
     26  1.1  christos License along with libiberty; see the file COPYING.LIB.
     27  1.1  christos If not, see <http://www.gnu.org/licenses/>.  */
     28  1.1  christos 
     29  1.1  christos 
     30  1.1  christos #ifdef HAVE_CONFIG_H
     31  1.1  christos #include "config.h"
     32  1.1  christos #endif
     33  1.1  christos 
     34  1.1  christos #include "safe-ctype.h"
     35  1.1  christos 
     36  1.1  christos #include <sys/types.h>
     37  1.1  christos #include <string.h>
     38  1.1  christos #include <stdio.h>
     39  1.1  christos 
     40  1.1  christos #ifdef HAVE_STRING_H
     41  1.1  christos #include <string.h>
     42  1.1  christos #else
     43  1.1  christos extern size_t strlen(const char *s);
     44  1.1  christos extern int strncmp(const char *s1, const char *s2, size_t n);
     45  1.1  christos extern void *memset(void *s, int c, size_t n);
     46  1.1  christos #endif
     47  1.1  christos 
     48  1.1  christos #include <demangle.h>
     49  1.1  christos #include "libiberty.h"
     50  1.1  christos 
     51  1.1  christos 
     52  1.1  christos /* Mangled Rust symbols look like this:
     53  1.1  christos      _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
     54  1.1  christos 
     55  1.1  christos    The original symbol is:
     56  1.1  christos      <std::sys::fd::FileDesc as core::ops::Drop>::drop
     57  1.1  christos 
     58  1.1  christos    The last component of the path is a 64-bit hash in lowercase hex,
     59  1.1  christos    prefixed with "h". Rust does not have a global namespace between
     60  1.1  christos    crates, an illusion which Rust maintains by using the hash to
     61  1.1  christos    distinguish things that would otherwise have the same symbol.
     62  1.1  christos 
     63  1.1  christos    Any path component not starting with a XID_Start character is
     64  1.1  christos    prefixed with "_".
     65  1.1  christos 
     66  1.1  christos    The following escape sequences are used:
     67  1.1  christos 
     68  1.1  christos    ","  =>  $C$
     69  1.1  christos    "@"  =>  $SP$
     70  1.1  christos    "*"  =>  $BP$
     71  1.1  christos    "&"  =>  $RF$
     72  1.1  christos    "<"  =>  $LT$
     73  1.1  christos    ">"  =>  $GT$
     74  1.1  christos    "("  =>  $LP$
     75  1.1  christos    ")"  =>  $RP$
     76  1.1  christos    " "  =>  $u20$
     77  1.1  christos    "\"" =>  $u22$
     78  1.1  christos    "'"  =>  $u27$
     79  1.1  christos    "+"  =>  $u2b$
     80  1.1  christos    ";"  =>  $u3b$
     81  1.1  christos    "["  =>  $u5b$
     82  1.1  christos    "]"  =>  $u5d$
     83  1.1  christos    "{"  =>  $u7b$
     84  1.1  christos    "}"  =>  $u7d$
     85  1.1  christos    "~"  =>  $u7e$
     86  1.1  christos 
     87  1.1  christos    A double ".." means "::" and a single "." means "-".
     88  1.1  christos 
     89  1.1  christos    The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$  */
     90  1.1  christos 
/* Text that introduces the trailing hash component of a mangled Rust
   symbol.  */
static const char hash_prefix[] = "::h";

/* Length of HASH_PREFIX, taken from the literal itself (the terminating
   NUL is not counted) so the two can never drift apart.  */
static const size_t hash_prefix_len = sizeof hash_prefix - 1;

/* Number of hex digits that follow HASH_PREFIX.  */
static const size_t hash_len = 16;

static int is_prefixed_hash (const char *str);
static int looks_like_rust (const char *str, size_t len);
static int unescape (const char **in, char **out, const char *seq, char value);
     98  1.1  christos 
     99  1.1  christos /* INPUT: sym: symbol that has been through C++ (gnu v3) demangling
    100  1.1  christos 
    101  1.1  christos    This function looks for the following indicators:
    102  1.1  christos 
    103  1.1  christos    1. The hash must consist of "h" followed by 16 lowercase hex digits.
    104  1.1  christos 
    105  1.1  christos    2. As a sanity check, the hash must use between 5 and 15 of the 16
    106  1.1  christos       possible hex digits. This is true of 99.9998% of hashes so once
    107  1.1  christos       in your life you may see a false negative. The point is to
    108  1.1  christos       notice path components that could be Rust hashes but are
    109  1.1  christos       probably not, like "haaaaaaaaaaaaaaaa". In this case a false
    110  1.1  christos       positive (non-Rust symbol has an important path component
    111  1.1  christos       removed because it looks like a Rust hash) is worse than a false
    112  1.1  christos       negative (the rare Rust symbol is not demangled) so this sets
    113  1.1  christos       the balance in favor of false negatives.
    114  1.1  christos 
    115  1.1  christos    3. There must be no characters other than a-zA-Z0-9 and _.:$
    116  1.1  christos 
    117  1.1  christos    4. There must be no unrecognized $-sign sequences.
    118  1.1  christos 
    119  1.1  christos    5. There must be no sequence of three or more dots in a row ("...").  */
    120  1.1  christos 
    121  1.1  christos int
    122  1.1  christos rust_is_mangled (const char *sym)
    123  1.1  christos {
    124  1.1  christos   size_t len, len_without_hash;
    125  1.1  christos 
    126  1.1  christos   if (!sym)
    127  1.1  christos     return 0;
    128  1.1  christos 
    129  1.1  christos   len = strlen (sym);
    130  1.1  christos   if (len <= hash_prefix_len + hash_len)
    131  1.1  christos     /* Not long enough to contain "::h" + hash + something else */
    132  1.1  christos     return 0;
    133  1.1  christos 
    134  1.1  christos   len_without_hash = len - (hash_prefix_len + hash_len);
    135  1.1  christos   if (!is_prefixed_hash (sym + len_without_hash))
    136  1.1  christos     return 0;
    137  1.1  christos 
    138  1.1  christos   return looks_like_rust (sym, len_without_hash);
    139  1.1  christos }
    140  1.1  christos 
    141  1.1  christos /* A hash is the prefix "::h" followed by 16 lowercase hex digits. The
    142  1.1  christos    hex digits must comprise between 5 and 15 (inclusive) distinct
    143  1.1  christos    digits.  */
    144  1.1  christos 
    145  1.1  christos static int
    146  1.1  christos is_prefixed_hash (const char *str)
    147  1.1  christos {
    148  1.1  christos   const char *end;
    149  1.1  christos   char seen[16];
    150  1.1  christos   size_t i;
    151  1.1  christos   int count;
    152  1.1  christos 
    153  1.1  christos   if (strncmp (str, hash_prefix, hash_prefix_len))
    154  1.1  christos     return 0;
    155  1.1  christos   str += hash_prefix_len;
    156  1.1  christos 
    157  1.1  christos   memset (seen, 0, sizeof(seen));
    158  1.1  christos   for (end = str + hash_len; str < end; str++)
    159  1.1  christos     if (*str >= '0' && *str <= '9')
    160  1.1  christos       seen[*str - '0'] = 1;
    161  1.1  christos     else if (*str >= 'a' && *str <= 'f')
    162  1.1  christos       seen[*str - 'a' + 10] = 1;
    163  1.1  christos     else
    164  1.1  christos       return 0;
    165  1.1  christos 
    166  1.1  christos   /* Count how many distinct digits seen */
    167  1.1  christos   count = 0;
    168  1.1  christos   for (i = 0; i < 16; i++)
    169  1.1  christos     if (seen[i])
    170  1.1  christos       count++;
    171  1.1  christos 
    172  1.1  christos   return count >= 5 && count <= 15;
    173  1.1  christos }
    174  1.1  christos 
/* Return nonzero if C is one of the characters that may appear
   literally in a mangled path: a-z, A-Z, 0-9, '_' or ':'.  The NUL
   character is never accepted.  */

static int
rust_plain_char (char c)
{
  static const char alphabet[] =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "_:";
  const char *p;

  for (p = alphabet; *p != '\0'; p++)
    if (*p == c)
      return 1;

  return 0;
}

/* Return 1 if the LEN characters starting at STR consist solely of
   a-zA-Z0-9, '_', ':', '.', and recognized "$...$" escape sequences,
   with no run of three or more dots; return 0 otherwise.  */

static int
looks_like_rust (const char *str, size_t len)
{
  /* Every escape sequence that is accepted.  */
  static const struct
  {
    const char *seq;
    size_t len;
  } escapes[] = {
    { "$C$", 3 },
    { "$SP$", 4 }, { "$BP$", 4 }, { "$RF$", 4 }, { "$LT$", 4 },
    { "$GT$", 4 }, { "$LP$", 4 }, { "$RP$", 4 },
    { "$u20$", 5 }, { "$u22$", 5 }, { "$u27$", 5 }, { "$u2b$", 5 },
    { "$u3b$", 5 }, { "$u5b$", 5 }, { "$u5d$", 5 }, { "$u7b$", 5 },
    { "$u7d$", 5 }, { "$u7e$", 5 }
  };
  const size_t n_escapes = sizeof escapes / sizeof escapes[0];
  const char *const limit = str + len;
  const char *cur = str;

  while (cur < limit)
    {
      if (*cur == '$')
        {
          size_t consumed = 0;
          size_t i;

          for (i = 0; i < n_escapes; i++)
            if (strncmp (cur, escapes[i].seq, escapes[i].len) == 0)
              {
                consumed = escapes[i].len;
                break;
              }

          /* A '$' that does not open a known escape disqualifies the
             symbol.  */
          if (consumed == 0)
            return 0;
          cur += consumed;
        }
      else if (*cur == '.')
        {
          /* Do not allow three or more consecutive dots.  */
          if (strncmp (cur, "...", 3) == 0)
            return 0;
          cur++;
        }
      else if (rust_plain_char (*cur))
        cur++;
      else
        return 0;
    }

  return 1;
}
    235  1.1  christos 
    236  1.1  christos /*
    237  1.1  christos   INPUT: sym: symbol for which rust_is_mangled(sym) returned 1.
    238  1.1  christos 
    239  1.1  christos   The input is demangled in-place because the mangled name is always
    240  1.1  christos   longer than the demangled one.  */
    241  1.1  christos 
    242  1.1  christos void
    243  1.1  christos rust_demangle_sym (char *sym)
    244  1.1  christos {
    245  1.1  christos   const char *in;
    246  1.1  christos   char *out;
    247  1.1  christos   const char *end;
    248  1.1  christos 
    249  1.1  christos   if (!sym)
    250  1.1  christos     return;
    251  1.1  christos 
    252  1.1  christos   in = sym;
    253  1.1  christos   out = sym;
    254  1.1  christos   end = sym + strlen (sym) - (hash_prefix_len + hash_len);
    255  1.1  christos 
    256  1.1  christos   while (in < end)
    257  1.1  christos     switch (*in)
    258  1.1  christos       {
    259  1.1  christos       case '$':
    260  1.1  christos 	if (!(unescape (&in, &out, "$C$", ',')
    261  1.1  christos 	      || unescape (&in, &out, "$SP$", '@')
    262  1.1  christos 	      || unescape (&in, &out, "$BP$", '*')
    263  1.1  christos 	      || unescape (&in, &out, "$RF$", '&')
    264  1.1  christos 	      || unescape (&in, &out, "$LT$", '<')
    265  1.1  christos 	      || unescape (&in, &out, "$GT$", '>')
    266  1.1  christos 	      || unescape (&in, &out, "$LP$", '(')
    267  1.1  christos 	      || unescape (&in, &out, "$RP$", ')')
    268  1.1  christos 	      || unescape (&in, &out, "$u20$", ' ')
    269  1.1  christos 	      || unescape (&in, &out, "$u22$", '\"')
    270  1.1  christos 	      || unescape (&in, &out, "$u27$", '\'')
    271  1.1  christos 	      || unescape (&in, &out, "$u2b$", '+')
    272  1.1  christos 	      || unescape (&in, &out, "$u3b$", ';')
    273  1.1  christos 	      || unescape (&in, &out, "$u5b$", '[')
    274  1.1  christos 	      || unescape (&in, &out, "$u5d$", ']')
    275  1.1  christos 	      || unescape (&in, &out, "$u7b$", '{')
    276  1.1  christos 	      || unescape (&in, &out, "$u7d$", '}')
    277  1.1  christos 	      || unescape (&in, &out, "$u7e$", '~'))) {
    278  1.1  christos 	  /* unexpected escape sequence, not looks_like_rust. */
    279  1.1  christos 	  goto fail;
    280  1.1  christos 	}
    281  1.1  christos 	break;
    282  1.1  christos       case '_':
    283  1.1  christos 	/* If this is the start of a path component and the next
    284  1.1  christos 	   character is an escape sequence, ignore the underscore. The
    285  1.1  christos 	   mangler inserts an underscore to make sure the path
    286  1.1  christos 	   component begins with a XID_Start character. */
    287  1.1  christos 	if ((in == sym || in[-1] == ':') && in[1] == '$')
    288  1.1  christos 	  in++;
    289  1.1  christos 	else
    290  1.1  christos 	  *out++ = *in++;
    291  1.1  christos 	break;
    292  1.1  christos       case '.':
    293  1.1  christos 	if (in[1] == '.')
    294  1.1  christos 	  {
    295  1.1  christos 	    /* ".." becomes "::" */
    296  1.1  christos 	    *out++ = ':';
    297  1.1  christos 	    *out++ = ':';
    298  1.1  christos 	    in += 2;
    299  1.1  christos 	  }
    300  1.1  christos 	else
    301  1.1  christos 	  {
    302  1.1  christos 	    /* "." becomes "-" */
    303  1.1  christos 	    *out++ = '-';
    304  1.1  christos 	    in++;
    305  1.1  christos 	  }
    306  1.1  christos 	break;
    307  1.1  christos       case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    308  1.1  christos       case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    309  1.1  christos       case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    310  1.1  christos       case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    311  1.1  christos       case 'y': case 'z':
    312  1.1  christos       case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    313  1.1  christos       case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    314  1.1  christos       case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    315  1.1  christos       case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    316  1.1  christos       case 'Y': case 'Z':
    317  1.1  christos       case '0': case '1': case '2': case '3': case '4': case '5':
    318  1.1  christos       case '6': case '7': case '8': case '9':
    319  1.1  christos       case ':':
    320  1.1  christos 	*out++ = *in++;
    321  1.1  christos 	break;
    322  1.1  christos       default:
    323  1.1  christos 	/* unexpected character in symbol, not looks_like_rust.  */
    324  1.1  christos 	goto fail;
    325  1.1  christos       }
    326  1.1  christos   goto done;
    327  1.1  christos 
    328  1.1  christos fail:
    329  1.1  christos   *out++ = '?'; /* This is pretty lame, but it's hard to do better. */
    330  1.1  christos done:
    331  1.1  christos   *out = '\0';
    332  1.1  christos }
    333  1.1  christos 
/* If the text at *IN begins with the escape sequence SEQ, store VALUE
   at *OUT, advance *IN past the sequence and *OUT by one character,
   and return 1.  Otherwise leave both pointers and the output
   untouched and return 0.  */

static int
unescape (const char **in, char **out, const char *seq, char value)
{
  const size_t seq_len = strlen (seq);
  const char *src = *in;
  char *dst = *out;

  if (strncmp (src, seq, seq_len) == 0)
    {
      *dst = value;
      *in = src + seq_len;
      *out = dst + 1;
      return 1;
    }

  return 0;
}
    349