Home | History | Annotate | Line # | Download | only in scripts
      1 #!/usr/bin/env python
      2 #===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
      3 #
      4 #                     The LLVM Compiler Infrastructure
      5 #
      6 # This file is distributed under the University of Illinois Open Source
      7 # License. See LICENSE.TXT for details.
      8 #
      9 #===------------------------------------------------------------------------===#
     10 import argparse
     11 import bisect
     12 import getopt
     13 import os
     14 import re
     15 import subprocess
     16 import sys
     17 
# Module-wide state. Most of these are overwritten by the command-line
# handling at the bottom of the file when it is run as a script.

# Per-binary symbolizer chains, keyed by binary path; populated by
# SymbolizationLoop.symbolize_address().
symbolizers = {}
# When true, the symbolizers print the commands and requests they issue.
DEBUG = False
# Whether the external tools are asked to demangle names (-d/--demangle).
demangle = False
# String prepended to 'addr2line' when building its command line (-c).
binutils_prefix = None
# String prepended to binary names by sysroot_path_filter() (-s).
sysroot_path = None
# Optional callable mapping a binary name from the log to the name to use.
binary_name_filter = None
# Regex fragments that fix_filename() strips from file names.
fix_filename_patterns = None
# Stream that SymbolizationLoop.process_logfile() reads the report from.
logfile = sys.stdin
# If false, symbolize_address() raises instead of falling back to the
# system symbolizer (addr2line/atos).
allow_system_symbolizer = True
# If true, symbolize_address() skips Breakpad and llvm-symbolizer.
force_system_symbolizer = False
     28 
     29 # FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Normalize a 'file:line' location string for display.

  Every entry of the global fix_filename_patterns (if any) is treated as a
  regular expression; everything up to and including its match is removed.
  Afterwards ASan runtime sources are collapsed to '_asan_rtl_' and the
  crtstuff.c:0 location is replaced by '???:0'.

  Args:
      file_name: location string as printed by a symbolizer tool.
  Returns:
      the rewritten location string.
  """
  for cut_pattern in fix_filename_patterns or ():
    file_name = re.sub('.*' + cut_pattern, '', file_name)
  builtin_rewrites = (
      ('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_'),
      ('.*crtstuff.c:0', '???:0'),
  )
  for pattern, replacement in builtin_rewrites:
    file_name = re.sub(pattern, replacement, file_name)
  return file_name
     37 
def sysroot_path_filter(binary_name):
  """Return binary_name prefixed with the global sysroot_path.

  The two strings are concatenated as-is; no path separator is inserted.
  """
  prefixed_name = sysroot_path + binary_name
  return prefixed_name
     40 
def is_valid_arch(s):
  """Return True if s is one of the architecture names this script knows."""
  known_arches = (
      "i386", "x86_64", "x86_64h",
      "arm", "armv6", "armv7", "armv7s", "armv7k", "arm64",
      "powerpc64", "powerpc64le",
      "s390x", "s390",
  )
  return s in known_arches
     44 
def guess_arch(addr):
  """Guess the architecture from the textual width of an address.

  Anything longer than 10 characters (len('0x') + 8 hex digits) is taken to
  be a 64-bit address and reported as 'x86_64'; otherwise 'i386'.
  """
  longest_32bit_text = len('0x') + 8
  return 'x86_64' if len(addr) > longest_32bit_text else 'i386'
     51 
class Symbolizer(object):
  """Common interface of all symbolizers; this base one resolves nothing."""

  def __init__(self):
    """Nothing to set up in the base class."""

  def symbolize(self, addr, binary, offset):
    """Describe the code location of one instruction.

    Overridden in subclasses; this implementation always gives up.

    Args:
        addr: virtual address of an instruction.
        binary: path to executable/shared object containing this instruction.
        offset: instruction offset in the @binary.
    Returns:
        list of strings (one per inlined frame) giving function name, file
        name, line and column numbers for the instruction, or None when the
        address cannot be symbolized.
    """
    return None
     70 
     71 
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to one long-running llvm-symbolizer process."""

  def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
    """Start llvm-symbolizer.

    Args:
        symbolizer_path: executable to launch.
        default_arch: value passed as --default-arch.
        system: OS name; .dSYM hints are only passed when it is 'Darwin'.
        dsym_hints: iterable of .dSYM hint paths (defaults to no hints).
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.default_arch = default_arch
    self.system = system
    # A fresh list per instance: a '[]' default in the signature would be one
    # object shared by every instance created without explicit hints.
    self.dsym_hints = [] if dsym_hints is None else dsym_hints
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Launch the tool and return the Popen object, or None on OSError."""
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=linkage',
           '--inlining=true',
           '--default-arch=%s' % self.default_arch]
    if self.system == 'Darwin':
      for hint in self.dsym_hints:
        cmd.append('--dsym-hint=%s' % hint)
    if DEBUG:
      print(' '.join(cmd))
    try:
      result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                bufsize=0,
                                universal_newlines=True)
    except OSError:
      # The tool could not be started; symbolize() will then return None so
      # that the caller can fall back to another symbolizer.
      result = None
    return result

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Sends '"<binary>" <offset>' to the tool and reads back pairs of lines
    (function name, file location) until an empty line is seen.

    Returns:
        list of '<addr> in <function> <file>' strings, or None when the tool
        is not running, the exchange failed, or only '??' frames came back.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '"%s" %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write("%s\n" % symbolizer_input)
      # Make sure the request is delivered before blocking on the reply;
      # this costs nothing when the pipe is already unbuffered.
      self.pipe.stdin.flush()
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') or
            not file_name.startswith('??')):
          # Append only non-trivial frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Deliberately best-effort: any failure (e.g. a broken pipe) is
      # reported as "no result" so the next symbolizer can be tried.
      result = []
    if not result:
      result = None
    return result
    128 
    129 
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=None):
  """Create an LLVMSymbolizer, locating the executable via the environment.

  The executable is taken from LLVM_SYMBOLIZER_PATH, then
  ASAN_SYMBOLIZER_PATH; if neither is set (or is empty), 'llvm-symbolizer'
  is assumed to be in PATH.

  Args:
      system: OS name forwarded to LLVMSymbolizer.
      default_arch: default architecture forwarded to LLVMSymbolizer.
      dsym_hints: iterable of .dSYM hint paths (defaults to no hints).
  Returns:
      a new LLVMSymbolizer.
  """
  # Avoid a mutable default in the signature; always hand on a real
  # collection so the symbolizer can iterate it.
  hints = [] if dsym_hints is None else dsym_hints
  symbolizer_path = (os.getenv('LLVM_SYMBOLIZER_PATH') or
                     os.getenv('ASAN_SYMBOLIZER_PATH') or
                     'llvm-symbolizer')
  return LLVMSymbolizer(symbolizer_path, default_arch, system, hints)
    138 
    139 
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that drives one addr2line process for a single binary."""

  def __init__(self, binary):
    """Start addr2line for @binary."""
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()
    # Written after every real query; the reply to it marks the end of the
    # (possibly multi-frame) reply to the real query.
    self.output_terminator = -1

  def open_addr2line(self):
    """Launch '<binutils_prefix>addr2line -fi [--demangle] -e <binary>'."""
    addr2line_tool = 'addr2line'
    if binutils_prefix:
      addr2line_tool = binutils_prefix + addr2line_tool
    cmd = [addr2line_tool, '-fi']
    if demangle:
      cmd += ['--demangle']
    cmd += ['-e', self.binary]
    if DEBUG:
      print(' '.join(cmd))
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            bufsize=0,
                            universal_newlines=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance was created for,
        otherwise a list of '<addr> in <function> <file>' strings, one per
        (inlined) frame reported by addr2line.
    """
    if self.binary != binary:
      return None
    lines = []
    try:
      self.pipe.stdin.write("%s\n" % offset)
      self.pipe.stdin.write("%s\n" % self.output_terminator)
      is_first_frame = True
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        file_name = self.pipe.stdout.readline().rstrip()
        if is_first_frame:
          # The first pair always belongs to the real query, even if it is
          # unknown.
          is_first_frame = False
        elif function_name in ['', '??']:
          # Reply to the terminator query (or end of output). The file line
          # of an unknown location carries a line-number suffix (for example
          # '??:0'), so it is not equal to the function line; asserting
          # equality here failed on every call and added a bogus
          # '?? ??:0' frame through the except clause below.
          break
        lines.append((function_name, file_name))
    except Exception:
      # Best-effort: report an unknown frame rather than aborting the run.
      lines.append(('??', '??:0'))
    return ['%s in %s %s' % (addr, function, fix_filename(location))
            for (function, location) in lines]
    183 
class UnbufferedLineConverter(object):
  """
  Wrap a child process that responds to each line of input with one line of
  output.  Uses pty to trick the child into providing unbuffered output.
  """
  def __init__(self, args, close_stderr=False):
    """Fork a child on a new pty and exec @args in it.

    Args:
        args: command line as a list; args[0] is looked up in PATH.
        close_stderr: if true, the child's stderr is replaced by /dev/null.
    """
    # Local imports so that the script can start on Windows.
    import pty
    import termios
    pid, fd = pty.fork()
    if pid == 0:
      # We're the child. Transfer control to command.
      try:
        if close_stderr:
          dev_null = os.open('/dev/null', os.O_RDONLY)
          os.dup2(dev_null, 2)
        os.execvp(args[0], args)
      finally:
        # Only reached when the setup or the exec failed. Leave immediately
        # so the child never continues running the parent's code or flushes
        # copies of the parent's buffered output into the pty.
        os._exit(127)
    else:
      # Disable echoing.
      attr = termios.tcgetattr(fd)
      attr[3] = attr[3] & ~termios.ECHO
      termios.tcsetattr(fd, termios.TCSANOW, attr)
      # Set up a file()-like interface to the child process
      self.r = os.fdopen(fd, "r", 1)
      self.w = os.fdopen(os.dup(fd), "w", 1)

  def convert(self, line):
    """Send @line to the child and return its next output line."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Read one line from the child, without trailing whitespace."""
    return self.r.readline().rstrip()
    215 
    216 
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that drives one 'atos' process for a single binary."""

  def __init__(self, addr, binary, arch):
    """Start atos for @binary and @arch (@addr is accepted but not used)."""
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    self.arch = arch
    self.open_atos()

  def open_atos(self):
    """Launch 'atos -o <binary> -arch <arch>' behind a pty wrapper."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance was created for,
        otherwise a one-element list describing the frame.
    """
    if self.binary != binary:
      return None
    if not os.path.exists(binary):
      # If the binary doesn't exist atos will exit which will lead to IOError
      # exceptions being raised later on so just don't try to symbolize.
      return ['{} ({}:{}+{})'.format(addr, binary, self.arch, offset)]
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip informational lines that precede the actual answer.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    # Raw strings keep the backslashes without relying on Python passing
    # unknown escape sequences through unchanged.
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print('atos_line: ', atos_line)
    if match:
      function_name = match.group(1)
      # Drop the parenthesised parameter list from the function name.
      function_name = re.sub(r'\(.*?\)', '', function_name)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
    253 
    254 
    255 # Chain several symbolizers so that if one symbolizer fails, we fall back
    256 # to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first answer."""

  def __init__(self, symbolizer_list):
    """Keep @symbolizer_list (not copied); falsy entries are skipped later."""
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        the first truthy result produced by a member of the chain, or None
        if no member produced one.
    """
    for candidate in self.symbolizer_list:
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add @symbolizer to the end of the chain."""
    self.symbolizer_list.append(symbolizer)
    273 
    274 
def BreakpadSymbolizerFactory(binary):
  """Return a BreakpadSymbolizer for @binary if a symbol file is available.

  The symbol file name is @binary followed by the value of the
  BREAKPAD_SUFFIX environment variable. Returns None when the variable is
  unset or empty, or when that file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
    282 
    283 
def SystemSymbolizerFactory(system, addr, binary, arch):
  """Create the platform's native symbolizer for @binary.

  Returns a DarwinSymbolizer on 'Darwin', an Addr2LineSymbolizer on 'Linux',
  'FreeBSD', 'NetBSD' and 'SunOS', and None for any other system name.
  """
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary, arch)
  if system in ('Linux', 'FreeBSD', 'NetBSD', 'SunOS'):
    return Addr2LineSymbolizer(binary)
  return None
    288     return Addr2LineSymbolizer(binary)
    289 
    290 
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers from a Breakpad text symbol file."""

  def __init__(self, filename):
    """Load and index the symbol file @filename."""
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # open() works on both Python 2 and 3 (file() is Python 2 only), and the
    # with-statement closes the handle once the lines are read.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    self.files = []
    self.symbols = {}
    self.address_list = []
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Fill the file, symbol and address tables from the record lines."""
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to record.
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A name already recorded for this address takes precedence.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Look up @addr (an int offset).

    Returns:
        (symbol, filename, line_no) of the line record covering @addr, or
        None if no record covers it.
    """
    key = None
    if addr in self.addresses:
      key = addr
    else:
      # Closest record starting below addr, if any.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      else:
        key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        a one-element list '<addr> in <function> <file>:<line>', or None if
        @binary is not the module of this symbol file or @offset is not
        covered by it.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      # Diagnostic output only, like in the other symbolizers; printing it
      # unconditionally would mix a list repr into the symbolized report.
      if DEBUG:
        print(result)
      return result
    else:
      return None
    363 
    364 
class SymbolizationLoop(object):
  """Reads a sanitizer report line by line and symbolizes its stack frames."""

  # Matches a raw stack frame such as
  #   #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
  # Groups: frame prefix, frame number, address, binary, offset.
  _STACK_TRACE_LINE_RE = re.compile(
      r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')

  def __init__(self, binary_name_filter=None, dsym_hint_producer=None):
    """Set up the loop for the current platform.

    Args:
        binary_name_filter: optional callable mapping a binary name from the
            log to the name that should be symbolized.
        dsym_hint_producer: optional callable returning .dSYM hints for a
            binary; only consulted on Darwin.
    Raises:
        Exception: on a POSIX system other than the supported ones.
    """
    if sys.platform == 'win32':
      # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
      # even in sandboxed processes.  Nothing needs to be done here.
      self.process_line = self.process_line_echo
    else:
      # Used by clients who may want to supply a different binary name.
      # E.g. in Chrome several binaries may share a single .dSYM.
      self.binary_name_filter = binary_name_filter
      self.dsym_hint_producer = dsym_hint_producer
      self.system = os.uname()[0]
      if self.system not in ['Linux', 'Darwin', 'FreeBSD', 'NetBSD','SunOS']:
        raise Exception('Unknown system')
      self.llvm_symbolizers = {}
      self.last_llvm_symbolizer = None
      self.dsym_hints = set()
      self.frame_no = 0
      self.process_line = self.process_line_posix

  def symbolize_address(self, addr, binary, offset, arch):
    """Symbolize one frame using the per-binary chain of symbolizers.

    Returns:
        non-empty list of frame descriptions.
    Raises:
        Exception: if nothing could be symbolized and the system symbolizer
            is not allowed.
    """
    # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
    # a single symbolizer binary.
    # On Darwin, if the dsym hint producer is present:
    #  1. check whether we've seen this binary already; if so,
    #     use |llvm_symbolizers[binary]|, which has already loaded the debug
    #     info for this binary (might not be the case for
    #     |last_llvm_symbolizer|);
    #  2. otherwise check if we've seen all the hints for this binary already;
    #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
    #  3. otherwise create a new symbolizer and pass all currently known
    #     .dSYM hints to it.
    result = None
    if not force_system_symbolizer:
      if binary not in self.llvm_symbolizers:
        use_new_symbolizer = True
        if self.system == 'Darwin' and self.dsym_hint_producer:
          dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
          use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
          self.dsym_hints |= dsym_hints_for_binary
        if self.last_llvm_symbolizer and not use_new_symbolizer:
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
        else:
          self.last_llvm_symbolizer = LLVMSymbolizerFactory(
              self.system, arch, self.dsym_hints)
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
      # Use the chain of symbolizers:
      # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
      # (fall back to next symbolizer if the previous one fails).
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer(
            [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    else:
      # Keep one chain per binary. Replacing it on every call would start a
      # new system symbolizer process for every single frame.
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer([])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      if not allow_system_symbolizer:
        raise Exception('Failed to launch or use llvm-symbolizer.')
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary, arch))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def get_symbolized_lines(self, symbolized_lines):
    """Format frames as '    #<n> <frame>' lines, advancing self.frame_no.

    Returns [self.current_line] unchanged when there is nothing to format.
    """
    if not symbolized_lines:
      return [self.current_line]
    else:
      result = []
      for symbolized_frame in symbolized_lines:
        result.append('    #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
        self.frame_no += 1
      return result

  def process_logfile(self):
    """Process every line of the global logfile and print the result."""
    self.frame_no = 0
    for line in logfile:
      processed = self.process_line(line)
      print('\n'.join(processed))

  def process_line_echo(self, line):
    """Return the line unchanged apart from trailing whitespace."""
    return [line.rstrip()]

  def process_line_posix(self, line):
    """Symbolize @line if it is a raw stack frame, otherwise echo it.

    Returns:
        list of output lines (several when inlined frames are expanded).
    """
    self.current_line = line.rstrip()
    match = self._STACK_TRACE_LINE_RE.match(line)
    if not match:
      return [self.current_line]
    if DEBUG:
      print(line)
    _, frameno_str, addr, binary, offset = match.groups()
    arch = ""
    # Arch can be embedded in the filename, e.g.: "libabc.dylib:x86_64h"
    colon_pos = binary.rfind(":")
    if colon_pos != -1:
      maybe_arch = binary[colon_pos+1:]
      if is_valid_arch(maybe_arch):
        arch = maybe_arch
        binary = binary[0:colon_pos]
    if arch == "":
      arch = guess_arch(addr)
    if frameno_str == '0':
      # Assume that frame #0 is the first frame of new stack trace.
      self.frame_no = 0
    original_binary = binary
    if self.binary_name_filter:
      binary = self.binary_name_filter(binary)
    symbolized_line = self.symbolize_address(addr, binary, offset, arch)
    if not symbolized_line:
      # Retry with the unfiltered name if the filter changed it.
      if original_binary != binary:
        symbolized_line = self.symbolize_address(addr, original_binary, offset, arch)
    return self.get_symbolized_lines(symbolized_line)
    482 
    483 
if __name__ == '__main__':
  # Script entry point: parse the command line, store the options in the
  # module-level settings, then symbolize the log.
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description='ASan symbolization script',
      epilog='Example of use:\n'
             'asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" '
             '-s "$HOME/SymbolFiles" < asan.log')
  parser.add_argument('path_to_cut', nargs='*',
                      help='pattern to be cut from the result file path ')
  parser.add_argument('-d','--demangle', action='store_true',
                      help='demangle function names')
  parser.add_argument('-s', metavar='SYSROOT',
                      help='set path to sysroot for sanitized binaries')
  parser.add_argument('-c', metavar='CROSS_COMPILE',
                      help='set prefix for binutils')
  parser.add_argument('-l','--logfile', default=sys.stdin,
                      type=argparse.FileType('r'),
                      help='set log file name to parse, default is stdin')
  parser.add_argument('--force-system-symbolizer', action='store_true',
                      help='don\'t use llvm-symbolizer')
  args = parser.parse_args()

  # Each setting keeps its module default unless the option was given.
  if args.path_to_cut:
    fix_filename_patterns = args.path_to_cut
  if args.demangle:
    demangle = True
  if args.s:
    # A sysroot implies rewriting every binary name through the filter.
    sysroot_path = args.s
    binary_name_filter = sysroot_path_filter
  if args.c:
    binutils_prefix = args.c
  logfile = args.logfile if args.logfile else sys.stdin
  if args.force_system_symbolizer:
    force_system_symbolizer = True
  if force_system_symbolizer:
    # Forcing the system symbolizer makes no sense if it is disallowed.
    assert allow_system_symbolizer

  loop = SymbolizationLoop(binary_name_filter)
  loop.process_logfile()
    523   loop.process_logfile()
    524