Home | History | Annotate | Line # | Download | only in libiberty
rust-demangle.c revision 1.1.1.2
      1      1.1  mrg /* Demangler for the Rust programming language
      2  1.1.1.2  mrg    Copyright (C) 2016-2018 Free Software Foundation, Inc.
      3      1.1  mrg    Written by David Tolnay (dtolnay (at) gmail.com).
      4      1.1  mrg 
      5      1.1  mrg This file is part of the libiberty library.
      6      1.1  mrg Libiberty is free software; you can redistribute it and/or
      7      1.1  mrg modify it under the terms of the GNU Library General Public
      8      1.1  mrg License as published by the Free Software Foundation; either
      9      1.1  mrg version 2 of the License, or (at your option) any later version.
     10      1.1  mrg 
     11      1.1  mrg In addition to the permissions in the GNU Library General Public
     12      1.1  mrg License, the Free Software Foundation gives you unlimited permission
     13      1.1  mrg to link the compiled version of this file into combinations with other
     14      1.1  mrg programs, and to distribute those combinations without any restriction
     15      1.1  mrg coming from the use of this file.  (The Library Public License
     16      1.1  mrg restrictions do apply in other respects; for example, they cover
     17      1.1  mrg modification of the file, and distribution when not linked into a
     18      1.1  mrg combined executable.)
     19      1.1  mrg 
     20      1.1  mrg Libiberty is distributed in the hope that it will be useful,
     21      1.1  mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
     22      1.1  mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23      1.1  mrg Library General Public License for more details.
     24      1.1  mrg 
     25      1.1  mrg You should have received a copy of the GNU Library General Public
     26      1.1  mrg License along with libiberty; see the file COPYING.LIB.
     27      1.1  mrg If not, see <http://www.gnu.org/licenses/>.  */
     28      1.1  mrg 
     29      1.1  mrg 
     30      1.1  mrg #ifdef HAVE_CONFIG_H
     31      1.1  mrg #include "config.h"
     32      1.1  mrg #endif
     33      1.1  mrg 
     34      1.1  mrg #include "safe-ctype.h"
     35      1.1  mrg 
     36      1.1  mrg #include <sys/types.h>
     37      1.1  mrg #include <string.h>
     38      1.1  mrg #include <stdio.h>
     39      1.1  mrg 
     40      1.1  mrg #ifdef HAVE_STRING_H
     41      1.1  mrg #include <string.h>
     42      1.1  mrg #else
     43      1.1  mrg extern size_t strlen(const char *s);
     44      1.1  mrg extern int strncmp(const char *s1, const char *s2, size_t n);
     45      1.1  mrg extern void *memset(void *s, int c, size_t n);
     46      1.1  mrg #endif
     47      1.1  mrg 
     48      1.1  mrg #include <demangle.h>
     49      1.1  mrg #include "libiberty.h"
     50      1.1  mrg 
     51      1.1  mrg 
     52      1.1  mrg /* Mangled Rust symbols look like this:
     53      1.1  mrg      _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
     54      1.1  mrg 
     55      1.1  mrg    The original symbol is:
     56      1.1  mrg      <std::sys::fd::FileDesc as core::ops::Drop>::drop
     57      1.1  mrg 
     58      1.1  mrg    The last component of the path is a 64-bit hash in lowercase hex,
     59      1.1  mrg    prefixed with "h". Rust does not have a global namespace between
     60      1.1  mrg    crates, an illusion which Rust maintains by using the hash to
     61      1.1  mrg    distinguish things that would otherwise have the same symbol.
     62      1.1  mrg 
     63      1.1  mrg    Any path component not starting with a XID_Start character is
     64      1.1  mrg    prefixed with "_".
     65      1.1  mrg 
     66      1.1  mrg    The following escape sequences are used:
     67      1.1  mrg 
     68      1.1  mrg    ","  =>  $C$
     69      1.1  mrg    "@"  =>  $SP$
     70      1.1  mrg    "*"  =>  $BP$
     71      1.1  mrg    "&"  =>  $RF$
     72      1.1  mrg    "<"  =>  $LT$
     73      1.1  mrg    ">"  =>  $GT$
     74      1.1  mrg    "("  =>  $LP$
     75      1.1  mrg    ")"  =>  $RP$
     76      1.1  mrg    " "  =>  $u20$
     77      1.1  mrg    "\"" =>  $u22$
     78      1.1  mrg    "'"  =>  $u27$
     79      1.1  mrg    "+"  =>  $u2b$
     80      1.1  mrg    ";"  =>  $u3b$
     81      1.1  mrg    "["  =>  $u5b$
     82      1.1  mrg    "]"  =>  $u5d$
     83      1.1  mrg    "{"  =>  $u7b$
     84      1.1  mrg    "}"  =>  $u7d$
     85      1.1  mrg    "~"  =>  $u7e$
     86      1.1  mrg 
     87      1.1  mrg    A double ".." means "::" and a single "." means "-".
     88      1.1  mrg 
     89      1.1  mrg    The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$  */
     90      1.1  mrg 
/* Text that introduces the trailing hash component of a symbol: the
   "::" path separator followed by the marker letter "h".  */
static const char *hash_prefix = "::h";
/* Number of characters in hash_prefix, excluding the terminating NUL.  */
static const size_t hash_prefix_len = 3;
/* Number of hex digits expected after hash_prefix.  */
static const size_t hash_len = 16;

/* Forward declarations of the file-local helpers defined below.  */
static int is_prefixed_hash (const char *start);
static int looks_like_rust (const char *sym, size_t len);
static int unescape (const char **in, char **out, const char *seq, char value);
     98      1.1  mrg 
     99      1.1  mrg /* INPUT: sym: symbol that has been through C++ (gnu v3) demangling
    100      1.1  mrg 
    101      1.1  mrg    This function looks for the following indicators:
    102      1.1  mrg 
    103      1.1  mrg    1. The hash must consist of "h" followed by 16 lowercase hex digits.
    104      1.1  mrg 
    105      1.1  mrg    2. As a sanity check, the hash must use between 5 and 15 of the 16
    106      1.1  mrg       possible hex digits. This is true of 99.9998% of hashes so once
    107      1.1  mrg       in your life you may see a false negative. The point is to
    108      1.1  mrg       notice path components that could be Rust hashes but are
    109      1.1  mrg       probably not, like "haaaaaaaaaaaaaaaa". In this case a false
    110      1.1  mrg       positive (non-Rust symbol has an important path component
    111      1.1  mrg       removed because it looks like a Rust hash) is worse than a false
    112      1.1  mrg       negative (the rare Rust symbol is not demangled) so this sets
    113      1.1  mrg       the balance in favor of false negatives.
    114      1.1  mrg 
    115      1.1  mrg    3. There must be no characters other than a-zA-Z0-9 and _.:$
    116      1.1  mrg 
    117      1.1  mrg    4. There must be no unrecognized $-sign sequences.
    118      1.1  mrg 
    119      1.1  mrg    5. There must be no sequence of three or more dots in a row ("...").  */
    120      1.1  mrg 
    121      1.1  mrg int
    122      1.1  mrg rust_is_mangled (const char *sym)
    123      1.1  mrg {
    124      1.1  mrg   size_t len, len_without_hash;
    125      1.1  mrg 
    126      1.1  mrg   if (!sym)
    127      1.1  mrg     return 0;
    128      1.1  mrg 
    129      1.1  mrg   len = strlen (sym);
    130      1.1  mrg   if (len <= hash_prefix_len + hash_len)
    131      1.1  mrg     /* Not long enough to contain "::h" + hash + something else */
    132      1.1  mrg     return 0;
    133      1.1  mrg 
    134      1.1  mrg   len_without_hash = len - (hash_prefix_len + hash_len);
    135      1.1  mrg   if (!is_prefixed_hash (sym + len_without_hash))
    136      1.1  mrg     return 0;
    137      1.1  mrg 
    138      1.1  mrg   return looks_like_rust (sym, len_without_hash);
    139      1.1  mrg }
    140      1.1  mrg 
    141      1.1  mrg /* A hash is the prefix "::h" followed by 16 lowercase hex digits. The
    142      1.1  mrg    hex digits must comprise between 5 and 15 (inclusive) distinct
    143      1.1  mrg    digits.  */
    144      1.1  mrg 
    145      1.1  mrg static int
    146      1.1  mrg is_prefixed_hash (const char *str)
    147      1.1  mrg {
    148      1.1  mrg   const char *end;
    149      1.1  mrg   char seen[16];
    150      1.1  mrg   size_t i;
    151      1.1  mrg   int count;
    152      1.1  mrg 
    153      1.1  mrg   if (strncmp (str, hash_prefix, hash_prefix_len))
    154      1.1  mrg     return 0;
    155      1.1  mrg   str += hash_prefix_len;
    156      1.1  mrg 
    157      1.1  mrg   memset (seen, 0, sizeof(seen));
    158      1.1  mrg   for (end = str + hash_len; str < end; str++)
    159      1.1  mrg     if (*str >= '0' && *str <= '9')
    160      1.1  mrg       seen[*str - '0'] = 1;
    161      1.1  mrg     else if (*str >= 'a' && *str <= 'f')
    162      1.1  mrg       seen[*str - 'a' + 10] = 1;
    163      1.1  mrg     else
    164      1.1  mrg       return 0;
    165      1.1  mrg 
    166      1.1  mrg   /* Count how many distinct digits seen */
    167      1.1  mrg   count = 0;
    168      1.1  mrg   for (i = 0; i < 16; i++)
    169      1.1  mrg     if (seen[i])
    170      1.1  mrg       count++;
    171      1.1  mrg 
    172      1.1  mrg   return count >= 5 && count <= 15;
    173      1.1  mrg }
    174      1.1  mrg 
/* Return 1 if the first LEN characters of STR contain only what a
   mangled Rust path may contain, 0 otherwise.

   Accepted are ASCII letters, digits, '_', ':', runs of at most two
   '.' characters, and the recognized "$...$" escape sequences.  */

static int
looks_like_rust (const char *str, size_t len)
{
  /* Every escape sequence the mangler is known to emit.  */
  static const struct
  {
    const char *text;
    size_t length;
  } escapes[] = {
    { "$C$", 3 },
    { "$SP$", 4 },
    { "$BP$", 4 },
    { "$RF$", 4 },
    { "$LT$", 4 },
    { "$GT$", 4 },
    { "$LP$", 4 },
    { "$RP$", 4 },
    { "$u20$", 5 },
    { "$u22$", 5 },
    { "$u27$", 5 },
    { "$u2b$", 5 },
    { "$u3b$", 5 },
    { "$u5b$", 5 },
    { "$u5d$", 5 },
    { "$u7b$", 5 },
    { "$u7d$", 5 },
    { "$u7e$", 5 }
  };
  /* Characters that are accepted as they stand.  */
  static const char plain_chars[] =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789_:";
  const size_t escape_count = sizeof (escapes) / sizeof (escapes[0]);
  const char *cursor = str;
  const char *limit = str + len;

  while (cursor < limit)
    {
      const char c = *cursor;

      if (c == '$')
        {
          size_t k;

          for (k = 0; k < escape_count; k++)
            if (strncmp (cursor, escapes[k].text, escapes[k].length) == 0)
              break;

          /* An unrecognized $-sequence disqualifies the symbol.  */
          if (k == escape_count)
            return 0;
          cursor += escapes[k].length;
        }
      else if (c == '.')
        {
          /* Do not allow three or more consecutive dots.  */
          if (strncmp (cursor, "...", 3) == 0)
            return 0;
          cursor++;
        }
      /* strchr would also match the terminating NUL of plain_chars, so
         rule that character out explicitly.  */
      else if (c != '\0' && strchr (plain_chars, c) != NULL)
        cursor++;
      else
        return 0;
    }

  return 1;
}
    235      1.1  mrg 
    236      1.1  mrg /*
    237      1.1  mrg   INPUT: sym: symbol for which rust_is_mangled(sym) returned 1.
    238      1.1  mrg 
    239      1.1  mrg   The input is demangled in-place because the mangled name is always
    240      1.1  mrg   longer than the demangled one.  */
    241      1.1  mrg 
    242      1.1  mrg void
    243      1.1  mrg rust_demangle_sym (char *sym)
    244      1.1  mrg {
    245      1.1  mrg   const char *in;
    246      1.1  mrg   char *out;
    247      1.1  mrg   const char *end;
    248      1.1  mrg 
    249      1.1  mrg   if (!sym)
    250      1.1  mrg     return;
    251      1.1  mrg 
    252      1.1  mrg   in = sym;
    253      1.1  mrg   out = sym;
    254      1.1  mrg   end = sym + strlen (sym) - (hash_prefix_len + hash_len);
    255      1.1  mrg 
    256      1.1  mrg   while (in < end)
    257      1.1  mrg     switch (*in)
    258      1.1  mrg       {
    259      1.1  mrg       case '$':
    260      1.1  mrg 	if (!(unescape (&in, &out, "$C$", ',')
    261      1.1  mrg 	      || unescape (&in, &out, "$SP$", '@')
    262      1.1  mrg 	      || unescape (&in, &out, "$BP$", '*')
    263      1.1  mrg 	      || unescape (&in, &out, "$RF$", '&')
    264      1.1  mrg 	      || unescape (&in, &out, "$LT$", '<')
    265      1.1  mrg 	      || unescape (&in, &out, "$GT$", '>')
    266      1.1  mrg 	      || unescape (&in, &out, "$LP$", '(')
    267      1.1  mrg 	      || unescape (&in, &out, "$RP$", ')')
    268      1.1  mrg 	      || unescape (&in, &out, "$u20$", ' ')
    269      1.1  mrg 	      || unescape (&in, &out, "$u22$", '\"')
    270      1.1  mrg 	      || unescape (&in, &out, "$u27$", '\'')
    271      1.1  mrg 	      || unescape (&in, &out, "$u2b$", '+')
    272      1.1  mrg 	      || unescape (&in, &out, "$u3b$", ';')
    273      1.1  mrg 	      || unescape (&in, &out, "$u5b$", '[')
    274      1.1  mrg 	      || unescape (&in, &out, "$u5d$", ']')
    275      1.1  mrg 	      || unescape (&in, &out, "$u7b$", '{')
    276      1.1  mrg 	      || unescape (&in, &out, "$u7d$", '}')
    277      1.1  mrg 	      || unescape (&in, &out, "$u7e$", '~'))) {
    278      1.1  mrg 	  /* unexpected escape sequence, not looks_like_rust. */
    279      1.1  mrg 	  goto fail;
    280      1.1  mrg 	}
    281      1.1  mrg 	break;
    282      1.1  mrg       case '_':
    283      1.1  mrg 	/* If this is the start of a path component and the next
    284      1.1  mrg 	   character is an escape sequence, ignore the underscore. The
    285      1.1  mrg 	   mangler inserts an underscore to make sure the path
    286      1.1  mrg 	   component begins with a XID_Start character. */
    287      1.1  mrg 	if ((in == sym || in[-1] == ':') && in[1] == '$')
    288      1.1  mrg 	  in++;
    289      1.1  mrg 	else
    290      1.1  mrg 	  *out++ = *in++;
    291      1.1  mrg 	break;
    292      1.1  mrg       case '.':
    293      1.1  mrg 	if (in[1] == '.')
    294      1.1  mrg 	  {
    295      1.1  mrg 	    /* ".." becomes "::" */
    296      1.1  mrg 	    *out++ = ':';
    297      1.1  mrg 	    *out++ = ':';
    298      1.1  mrg 	    in += 2;
    299      1.1  mrg 	  }
    300      1.1  mrg 	else
    301      1.1  mrg 	  {
    302      1.1  mrg 	    /* "." becomes "-" */
    303      1.1  mrg 	    *out++ = '-';
    304      1.1  mrg 	    in++;
    305      1.1  mrg 	  }
    306      1.1  mrg 	break;
    307      1.1  mrg       case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    308      1.1  mrg       case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    309      1.1  mrg       case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    310      1.1  mrg       case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    311      1.1  mrg       case 'y': case 'z':
    312      1.1  mrg       case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    313      1.1  mrg       case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    314      1.1  mrg       case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    315      1.1  mrg       case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    316      1.1  mrg       case 'Y': case 'Z':
    317      1.1  mrg       case '0': case '1': case '2': case '3': case '4': case '5':
    318      1.1  mrg       case '6': case '7': case '8': case '9':
    319      1.1  mrg       case ':':
    320      1.1  mrg 	*out++ = *in++;
    321      1.1  mrg 	break;
    322      1.1  mrg       default:
    323      1.1  mrg 	/* unexpected character in symbol, not looks_like_rust.  */
    324      1.1  mrg 	goto fail;
    325      1.1  mrg       }
    326      1.1  mrg   goto done;
    327      1.1  mrg 
    328      1.1  mrg fail:
    329      1.1  mrg   *out++ = '?'; /* This is pretty lame, but it's hard to do better. */
    330      1.1  mrg done:
    331      1.1  mrg   *out = '\0';
    332      1.1  mrg }
    333      1.1  mrg 
/* Try to consume the escape sequence SEQ at the read position *IN.

   On a match, store VALUE at the write position *OUT, advance *IN past
   the sequence and *OUT by one character, and return 1.  Otherwise
   leave both positions alone and return 0.  */

static int
unescape (const char **in, char **out, const char *seq, char value)
{
  const char *src = *in;
  char *dst = *out;
  const size_t seq_len = strlen (seq);

  if (strncmp (src, seq, seq_len) != 0)
    return 0;

  /* The comparison is finished, so it is safe to write even when the
     destination overlaps the sequence just read.  */
  *dst = value;
  *in = src + seq_len;
  *out = dst + 1;

  return 1;
}
    349