//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This file defines the interface for a function call trie.
//
//===----------------------------------------------------------------------===//
#ifndef XRAY_FUNCTION_CALL_TRIE_H
#define XRAY_FUNCTION_CALL_TRIE_H

#include "xray_buffer_queue.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <limits>
#include <new>         // For placement new.
#include <type_traits> // For std::aligned_storage.
#include <utility>

namespace __xray {

/// A FunctionCallTrie represents the stack traces of XRay instrumented
/// functions that we've encountered, where a node corresponds to a function
/// and the path from the root to the node represents its stack trace. Each
/// node in the trie will contain some useful values, including:
///
///   * The cumulative amount of time spent in this particular node/stack.
///   * The number of times this stack has appeared.
///   * A histogram of latencies for that particular node.
///
/// Each node in the trie will also contain a list of callees, represented
/// using an Array<NodeIdPair> -- each NodeIdPair instance will contain the
/// function ID of the callee, and a pointer to the node.
///
/// If we visualise this data structure, we'll find the following potential
/// representation:
///
///   [function id node] -> [callees] [cumulative time]
///                         [call counter] [latency histogram]
///
/// As an example, when we have a function in this pseudocode:
///
///   func f(N) {
///     g()
///     h()
///     for i := 1..N { j() }
///   }
///
/// We may end up with a trie of the following form:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ ... ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///
/// If for instance the function g() called j() like so:
///
///   func g() {
///     for i := 1..10 { j() }
///   }
///
/// We'll find the following updated trie:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ j' ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///   j' -> [ ... ] [...] [10] [...]
///
/// Note that we'll have a new node representing the path `f -> g -> j'` with
/// isolated data. This isolation gives us a means of representing the stack
/// traces as a path, as opposed to a key in a table. The alternative
/// implementation would be to use a separate table for the path, and use
/// hashes of the path as an identifier to accumulate the information. We've
/// moved away from that approach because computing the hash every time we
/// need to update a function's call information, while handling the entry
/// and exit events, is too expensive.
///
/// This approach allows us to maintain a shadow stack, which represents the
/// currently executing path, and on function exits quickly compute the amount
/// of time elapsed from the entry, then update the counters for the node
/// already represented in the trie. This necessitates an efficient
/// representation of the various data structures (the list of callees must be
/// cache-aware and efficient to look up, and the histogram must be compact
/// and quick to update) to enable us to keep the overheads of this
/// implementation to a minimum.
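///
/// The sketch below shows how the pieces are typically wired together. It is
/// illustrative only: the profiling runtime manages per-thread tries and
/// timestamps through its own machinery, and `readTimestamp` here is a
/// hypothetical stand-in for whatever TSC/CPU source the caller provides.
///
///   auto Allocators = FunctionCallTrie::InitAllocators();
///   FunctionCallTrie Trie(Allocators);
///
///   uint64_t TSC;
///   uint16_t CPU;
///   TSC = readTimestamp(CPU);                   // hypothetical
///   Trie.enterFunction(/*FId=*/1, TSC, CPU);    // enter f
///   TSC = readTimestamp(CPU);
///   Trie.enterFunction(/*FId=*/2, TSC, CPU);    // f calls g
///   TSC = readTimestamp(CPU);
///   Trie.exitFunction(/*FId=*/2, TSC, CPU);     // g returns
///   TSC = readTimestamp(CPU);
///   Trie.exitFunction(/*FId=*/1, TSC, CPU);     // f returns
///
///   // The per-root subtrees are now reachable through Trie.getRoots().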
class FunctionCallTrie {
public:
  struct Node;

  // We use a NodeIdPair type instead of a std::pair<...> to avoid relying on
  // standard library types in this header.
  struct NodeIdPair {
    Node *NodePtr;
    int32_t FId;
  };

  using NodeIdPairArray = Array<NodeIdPair>;
  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;

  // A Node in the FunctionCallTrie gives us a list of callees, the cumulative
  // number of times this node actually appeared, the cumulative amount of
  // time for this particular node including its children's call times, and
  // just the local time spent on this node. Each Node will have the ID of the
  // XRay instrumented function that it is associated with.
  struct Node {
    Node *Parent;
    NodeIdPairArray Callees;
    uint64_t CallCount;
    uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
    int32_t FId;

    // TODO: Include the compact histogram.
  };

private:
  struct ShadowStackEntry {
    uint64_t EntryTSC;
    Node *NodePtr;
    uint16_t EntryCPU;
  };

  using NodeArray = Array<Node>;
  using RootArray = Array<Node *>;
  using ShadowStackArray = Array<ShadowStackEntry>;

public:
  // We collate the allocators we need into a single struct, as a convenience
  // to allow us to initialize these as a group.
  struct Allocators {
    using NodeAllocatorType = NodeArray::AllocatorType;
    using RootAllocatorType = RootArray::AllocatorType;
    using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;

    // Use hosted aligned storage members to allow for trivial move and init.
    // This also allows us to sidestep the potentially-failing allocation
    // issue.
    typename std::aligned_storage<sizeof(NodeAllocatorType),
                                  alignof(NodeAllocatorType)>::type
        NodeAllocatorStorage;
    typename std::aligned_storage<sizeof(RootAllocatorType),
                                  alignof(RootAllocatorType)>::type
        RootAllocatorStorage;
    typename std::aligned_storage<sizeof(ShadowStackAllocatorType),
                                  alignof(ShadowStackAllocatorType)>::type
        ShadowStackAllocatorStorage;
    typename std::aligned_storage<sizeof(NodeIdPairAllocatorType),
                                  alignof(NodeIdPairAllocatorType)>::type
        NodeIdPairAllocatorStorage;

    NodeAllocatorType *NodeAllocator = nullptr;
    RootAllocatorType *RootAllocator = nullptr;
    ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
    NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;

    Allocators() = default;
    Allocators(const Allocators &) = delete;
    Allocators &operator=(const Allocators &) = delete;

    struct Buffers {
      BufferQueue::Buffer NodeBuffer;
      BufferQueue::Buffer RootsBuffer;
      BufferQueue::Buffer ShadowStackBuffer;
      BufferQueue::Buffer NodeIdPairBuffer;
    };

    explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage)
          NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage)
          RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(
          B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(
          B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }

    explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage) NodeAllocatorType(Max);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage) RootAllocatorType(Max);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }

    Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // Here we rely on the safety of memcpy'ing the contents of the storage
      // members, and then setting the source's pointers to nullptr.
      internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage,
                      sizeof(NodeAllocatorType));
      internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage,
                      sizeof(RootAllocatorType));
      internal_memcpy(&ShadowStackAllocatorStorage,
                      &O.ShadowStackAllocatorStorage,
                      sizeof(ShadowStackAllocatorType));
      internal_memcpy(&NodeIdPairAllocatorStorage,
                      &O.NodeIdPairAllocatorStorage,
                      sizeof(NodeIdPairAllocatorType));

      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);

      O.NodeAllocator = nullptr;
      O.RootAllocator = nullptr;
      O.ShadowStackAllocator = nullptr;
      O.NodeIdPairAllocator = nullptr;
    }

    Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // When moving into an existing instance, we ensure that we clean up the
      // current allocators.
      if (NodeAllocator)
        NodeAllocator->~NodeAllocatorType();
      if (O.NodeAllocator) {
        new (&NodeAllocatorStorage)
            NodeAllocatorType(std::move(*O.NodeAllocator));
        NodeAllocator =
            reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
        O.NodeAllocator = nullptr;
      } else {
        NodeAllocator = nullptr;
      }

      if (RootAllocator)
        RootAllocator->~RootAllocatorType();
      if (O.RootAllocator) {
        new (&RootAllocatorStorage)
            RootAllocatorType(std::move(*O.RootAllocator));
        RootAllocator =
            reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
        O.RootAllocator = nullptr;
      } else {
        RootAllocator = nullptr;
      }

      if (ShadowStackAllocator)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (O.ShadowStackAllocator) {
        new (&ShadowStackAllocatorStorage)
            ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator));
        ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
            &ShadowStackAllocatorStorage);
        O.ShadowStackAllocator = nullptr;
      } else {
        ShadowStackAllocator = nullptr;
      }

      if (NodeIdPairAllocator)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
      if (O.NodeIdPairAllocator) {
        new (&NodeIdPairAllocatorStorage)
            NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator));
        NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
            &NodeIdPairAllocatorStorage);
        O.NodeIdPairAllocator = nullptr;
      } else {
        NodeIdPairAllocator = nullptr;
      }

      return *this;
    }

    ~Allocators() XRAY_NEVER_INSTRUMENT {
      if (NodeAllocator != nullptr)
        NodeAllocator->~NodeAllocatorType();
      if (RootAllocator != nullptr)
        RootAllocator->~RootAllocatorType();
      if (ShadowStackAllocator != nullptr)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (NodeIdPairAllocator != nullptr)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
    }
  };

  static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT {
    return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
  }

  static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT {
    Allocators A(Max);
    return A;
  }

  static Allocators
  InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT {
    Allocators A(Bufs);
    return A;
  }
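
  // The three factories above cover the usual ways allocators get created. A
  // minimal sketch of the two common paths (illustrative only; `BQ` is a
  // hypothetical BufferQueue the caller already owns, and error checking of
  // getBuffer's return code is omitted for brevity):
  //
  //   // Path 1: size-bounded allocators backed by internal allocation.
  //   auto A = FunctionCallTrie::InitAllocatorsCustom(1 << 20);
  //
  //   // Path 2: allocators placed into pre-acquired buffers.
  //   FunctionCallTrie::Allocators::Buffers Bufs;
  //   BQ->getBuffer(Bufs.NodeBuffer);
  //   BQ->getBuffer(Bufs.RootsBuffer);
  //   BQ->getBuffer(Bufs.ShadowStackBuffer);
  //   BQ->getBuffer(Bufs.NodeIdPairBuffer);
  //   auto B = FunctionCallTrie::InitAllocatorsFromBuffers(Bufs);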

private:
  NodeArray Nodes;
  RootArray Roots;
  ShadowStackArray ShadowStack;
  NodeIdPairAllocatorType *NodeIdPairAllocator;
  uint32_t OverflowedFunctions;

public:
  explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT
      : Nodes(*A.NodeAllocator),
        Roots(*A.RootAllocator),
        ShadowStack(*A.ShadowStackAllocator),
        NodeIdPairAllocator(A.NodeIdPairAllocator),
        OverflowedFunctions(0) {}

  FunctionCallTrie() = delete;
  FunctionCallTrie(const FunctionCallTrie &) = delete;
  FunctionCallTrie &operator=(const FunctionCallTrie &) = delete;

  FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT
      : Nodes(std::move(O.Nodes)),
        Roots(std::move(O.Roots)),
        ShadowStack(std::move(O.ShadowStack)),
        NodeIdPairAllocator(O.NodeIdPairAllocator),
        OverflowedFunctions(O.OverflowedFunctions) {}

  FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT {
    Nodes = std::move(O.Nodes);
    Roots = std::move(O.Roots);
    ShadowStack = std::move(O.ShadowStack);
    NodeIdPairAllocator = O.NodeIdPairAllocator;
    OverflowedFunctions = O.OverflowedFunctions;
    return *this;
  }

  ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {}

  void enterFunction(const int32_t FId, uint64_t TSC,
                     uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    DCHECK_NE(FId, 0);

    // If we've already overflowed the function call stack, do not bother
    // attempting to record any more function entries.
    if (UNLIKELY(OverflowedFunctions)) {
      ++OverflowedFunctions;
      return;
    }

    // If this is the first function we've encountered, we want to set up the
    // node(s) and treat it as a root.
    if (UNLIKELY(ShadowStack.empty())) {
      auto *NewRoot = Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
      if (UNLIKELY(NewRoot == nullptr))
        return;
      if (Roots.AppendEmplace(NewRoot) == nullptr) {
        Nodes.trim(1);
        return;
      }
      if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) {
        Nodes.trim(1);
        Roots.trim(1);
        ++OverflowedFunctions;
        return;
      }
      return;
    }

    // From this point on, we require that the stack is not empty.
    DCHECK(!ShadowStack.empty());
    auto TopNode = ShadowStack.back().NodePtr;
    DCHECK_NE(TopNode, nullptr);

    // If we've seen this callee before, we access that node and place it on
    // top of the stack.
    auto *Callee = TopNode->Callees.find_element(
        [FId](const NodeIdPair &NR) { return NR.FId == FId; });
    if (Callee != nullptr) {
      CHECK_NE(Callee->NodePtr, nullptr);
      if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr)
        ++OverflowedFunctions;
      return;
    }

    // This means we've never seen this stack before, so we create a new node
    // here.
    auto *NewNode = Nodes.AppendEmplace(
        TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
    if (UNLIKELY(NewNode == nullptr))
      return;
    DCHECK_NE(NewNode, nullptr);
    TopNode->Callees.AppendEmplace(NewNode, FId);
    if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr)
      ++OverflowedFunctions;
    return;
  }
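
  // To make the bookkeeping above concrete, here is a sketch (with made-up
  // function IDs and timestamps) of the state after a caller issues:
  //
  //   enterFunction(1, t0, cpu);   // f
  //   enterFunction(2, t1, cpu);   // f -> g
  //   enterFunction(3, t2, cpu);   // f -> g -> j
  //
  //   Nodes:       {FId=1, Parent=null}, {FId=2, Parent=node(1)},
  //                {FId=3, Parent=node(2)}
  //   Roots:       [ node(1) ]
  //   ShadowStack: [ {t0, node(1)}, {t1, node(2)}, {t2, node(3)} ]
  //
  // A later enterFunction(3, ...) issued again from the g() frame finds the
  // existing callee and reuses node(3); only a different call path would
  // create a distinct node.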

  void exitFunction(int32_t FId, uint64_t TSC,
                    uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    // If we're exiting functions that have "overflowed", i.e. did not fit
    // into the stack due to allocator constraints, we decrement that count
    // first.
    if (OverflowedFunctions) {
      --OverflowedFunctions;
      return;
    }

    // When we exit a function, we look up the ShadowStack to see whether
    // we've entered this function before. We do as little processing here as
    // we can, since most of the hard work would have already been done at
    // function entry.
    uint64_t CumulativeTreeTime = 0;

    while (!ShadowStack.empty()) {
      const auto &Top = ShadowStack.back();
      auto TopNode = Top.NodePtr;
      DCHECK_NE(TopNode, nullptr);

      // The TSC we're provided may have wrapped around, ending up less than
      // the TSC we recorded when we entered the function.
      //
      // To get an accurate measurement of cycles, we check whether we've
      // wrapped (TSC < Top.EntryTSC) and, if so, account for the distance
      // from the entry TSC to the maximum value of the counter
      // (numeric_limits<uint64_t>::max()), then add the current TSC. The
      // resulting delta never exceeds the 64-bit unsigned maximum, so it
      // always fits in a uint64_t.
      //
      // NOTE: This assumes that TSCs are synchronised across CPUs.
      // TODO: Count the number of times we've seen CPU migrations.
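      //
      // Worked example with made-up values: EntryTSC = 1000 and TSC = 1750
      // gives LocalTime = 750. If the counter wrapped, say EntryTSC =
      // max() - 49 and TSC = 100, the expression below evaluates to
      // (max() - (max() - 49)) + 100 = 149.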
      uint64_t LocalTime =
          Top.EntryTSC > TSC
              ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
              : TSC - Top.EntryTSC;
      TopNode->CallCount++;
      TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
      CumulativeTreeTime += LocalTime;
      ShadowStack.trim(1);

      // TODO: Update the histogram for the node.
      if (TopNode->FId == FId)
        break;
    }
  }

  const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; }

  // The deepCopyInto operation will update the provided FunctionCallTrie by
  // re-creating the contents of this particular FunctionCallTrie in the other
  // FunctionCallTrie. It does this with a depth-first traversal from the
  // roots, re-creating each node it visits in the provided FunctionCallTrie.
  //
  // This operation will *not* destroy the state in |O|, and thus may cause
  // duplicate entries in |O| if it is not empty.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
  //
  // This function must *not* be called with a non-empty FunctionCallTrie |O|.
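  //
  // A minimal usage sketch (illustrative only; `ThreadTrie` is a hypothetical
  // trie owned by the caller): copying a per-thread trie into a freshly
  // constructed trie backed by its own allocators.
  //
  //   auto CopyAllocators = FunctionCallTrie::InitAllocators();
  //   FunctionCallTrie Copy(CopyAllocators);
  //   ThreadTrie.deepCopyInto(Copy);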
  void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    DCHECK(O.getRoots().empty());

    // We push each root into a stack, to use as the parent marker for new
    // nodes we push in as we're traversing depth-first down the call tree.
    struct NodeAndParent {
      FunctionCallTrie::Node *Node;
      FunctionCallTrie::Node *NewNode;
    };
    using Stack = Array<NodeAndParent>;

    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      // Add a node in O for this root.
      auto NewRoot = O.Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount,
          Root->CumulativeLocalTime, Root->FId);

      // If we cannot allocate more memory, we bail out right away.
      if (UNLIKELY(NewRoot == nullptr))
        return;

      if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr))
        return;

      // TODO: Figure out what to do if we fail to allocate any more stack
      // space. Maybe warn or report once?
      if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr)
        return;
      while (!DFSStack.empty()) {
        NodeAndParent NP = DFSStack.back();
        DCHECK_NE(NP.Node, nullptr);
        DCHECK_NE(NP.NewNode, nullptr);
        DFSStack.trim(1);
        for (const auto Callee : NP.Node->Callees) {
          auto NewNode = O.Nodes.AppendEmplace(
              NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator),
              Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
              Callee.FId);
          if (UNLIKELY(NewNode == nullptr))
            return;
          if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) ==
                       nullptr))
            return;
          if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) ==
                       nullptr))
            return;
        }
      }
    }
  }

  // The mergeInto operation will update the provided FunctionCallTrie by
  // traversing the current trie's roots and merging the data in its nodes
  // into the data in the target's nodes. If a node doesn't exist in the
  // provided trie, we add a new one in the right position, and inherit the
  // data from the original (current) trie, along with all its callees.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
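  //
  // A minimal usage sketch (illustrative only; `Merged`, `MergedAllocators`,
  // and `ThreadTries` are hypothetical objects owned by the caller): folding
  // per-thread tries into a single aggregate used for reporting.
  //
  //   FunctionCallTrie Merged(MergedAllocators);
  //   for (const FunctionCallTrie &T : ThreadTries)
  //     T.mergeInto(Merged);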
  void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    struct NodeAndTarget {
      FunctionCallTrie::Node *OrigNode;
      FunctionCallTrie::Node *TargetNode;
    };
    using Stack = Array<NodeAndTarget>;
    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      Node *TargetRoot = nullptr;
      auto R = O.Roots.find_element(
          [&](const Node *Node) { return Node->FId == Root->FId; });
      if (R == nullptr) {
        TargetRoot = O.Nodes.AppendEmplace(
            nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
            Root->FId);
        if (UNLIKELY(TargetRoot == nullptr))
          return;

        if (UNLIKELY(O.Roots.Append(TargetRoot) == nullptr))
          return;
      } else {
        TargetRoot = *R;
      }

      DFSStack.AppendEmplace(Root, TargetRoot);
      while (!DFSStack.empty()) {
        NodeAndTarget NT = DFSStack.back();
        DCHECK_NE(NT.OrigNode, nullptr);
        DCHECK_NE(NT.TargetNode, nullptr);
        DFSStack.trim(1);
        // TODO: Update the histogram as well when we have it ready.
        NT.TargetNode->CallCount += NT.OrigNode->CallCount;
        NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
        for (const auto Callee : NT.OrigNode->Callees) {
          auto TargetCallee = NT.TargetNode->Callees.find_element(
              [&](const FunctionCallTrie::NodeIdPair &C) {
                return C.FId == Callee.FId;
              });
          if (TargetCallee == nullptr) {
            auto NewTargetNode = O.Nodes.AppendEmplace(
                NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
                Callee.FId);

            if (UNLIKELY(NewTargetNode == nullptr))
              return;

            TargetCallee =
                NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
            if (UNLIKELY(TargetCallee == nullptr))
              return;
          }
          DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
        }
      }
    }
  }
};

} // namespace __xray

#endif // XRAY_FUNCTION_CALL_TRIE_H