//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

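// Header written at the start of each per-thread block in the serialized
// profile. BlockSize counts this header plus all of the records that follow
// it, and BlockNum is what nextBuffer() uses to find the block that comes
// after a given one.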
struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue as the backing store for the allocator used
// by the ThreadData array. This lets us take ownership of the buffers,
// allocators, and tries associated with a thread by moving the data into the
// array instead of copying it into a separately backed set of tries.
static typename std::aligned_storage<
    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. The
// storage must be suitably aligned so that we can placement-new objects into
// it and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
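// post() and serialize() check this with an acquire load before touching the
// global state above, while reset() publishes the freshly constructed objects
// with a release store, so readers that observe the flag set also observe the
// initialized pointers.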
static atomic_uint8_t CollectorInitialized{0};

} // namespace

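// Takes ownership of a thread's FunctionCallTrie along with its allocators and
// backing buffers, moving them into the global ThreadData array so they can be
// serialized later. If the collector isn't initialized, or the append fails,
// the moved-from objects are destroyed and their buffers returned to the
// queue instead.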
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed to us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray is the list of function IDs that make up a stack trace. In this
// context a path is almost always represented from the leaf function in a call
// stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the list of function IDs from the leaf to the
  // root of the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Perform a depth-first traversal from each root of the FunctionCallTrie to
// generate a path (and the data associated with it) for every node.
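// As a purely illustrative example (with hypothetical functions): if a thread
// only ever executed the chain main() -> f() -> g(), the trie has a single
// root for main(), and the record emitted for g()'s node carries the
// leaf-first path [FId(g), FId(f), FId(main)], since we walk Node->Parent up
// to the root.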
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Walk up through the Node's parents, collecting the FIds in the order
      // they appear (leaf first).
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

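// Copy a single block into Buffer->Data using the layout that serialize()
// sized it for: the BlockHeader first, then for each record the path of
// function IDs (leaf to root), a zero sentinel FId, the node's CallCount
// (u64), and its CumulativeLocalTime (u64).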
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function IDs in the path follows:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

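// Regenerates the global ProfileBuffers from the per-thread tries collected so
// far: each thread's FunctionCallTrie is flattened into ProfileRecords and
// then serialized into one contiguous, heap-allocated block per thread.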
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We
    // use a local allocator and an __xray::Array<...> to store the
    // intermediate data, computing the sizes as we go. Then we allocate the
    // contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids)
    //   + sentinel id (4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
    //
    // That gives the fixed 20 bytes per record accounted for below, on top of
    // 4 bytes per function id in the path.
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

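// Tears down any previously serialized buffers and the per-thread data, then
// re-initializes the backing BufferQueue and placement-news fresh allocators
// and arrays into the static storage, marking the collector initialized only
// once everything is set up.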
void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we reset the
    // pointer to nullptr and re-use the storage later, placement-new'ing into
    // the storage as-is.
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

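// Iterator-style accessor used when flushing the profile: the first call (with
// a null XRayBuffer) returns the file header, the next call returns block 0,
// and each subsequent call uses the BlockNum in the previous block's header to
// find the following block, returning {nullptr, 0} once the blocks run out.
//
// A minimal sketch of how a caller might drive this (consume() is a
// hypothetical sink, not part of this file):
//
//   XRayBuffer B = nextBuffer({nullptr, 0}); // file header first
//   while (B.Data != nullptr) {
//     consume(B.Data, B.Size);
//     B = nextBuffer(B);
//   }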
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray