Home | History | Annotate | Line # | Download | only in Serialization
      1  1.1  joerg //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
      2  1.1  joerg //
      3  1.1  joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  1.1  joerg // See https://llvm.org/LICENSE.txt for license information.
      5  1.1  joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  1.1  joerg //
      7  1.1  joerg //===----------------------------------------------------------------------===//
      8  1.1  joerg //
      9  1.1  joerg //  This file provides a hash table data structure suitable for incremental and
     10  1.1  joerg //  distributed storage across a set of files.
     11  1.1  joerg //
     12  1.1  joerg //  Multiple hash tables from different files are implicitly merged to improve
     13  1.1  joerg //  performance, and on reload the merged table will override those from other
     14  1.1  joerg //  files.
     15  1.1  joerg //
     16  1.1  joerg //===----------------------------------------------------------------------===//
     17  1.1  joerg 
     18  1.1  joerg #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
     19  1.1  joerg #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
     20  1.1  joerg 
     21  1.1  joerg #include "llvm/ADT/DenseMap.h"
     22  1.1  joerg #include "llvm/ADT/DenseSet.h"
     23  1.1  joerg #include "llvm/ADT/PointerUnion.h"
     24  1.1  joerg #include "llvm/ADT/STLExtras.h"
     25  1.1  joerg #include "llvm/ADT/SmallVector.h"
     26  1.1  joerg #include "llvm/ADT/TinyPtrVector.h"
     27  1.1  joerg #include "llvm/ADT/iterator_range.h"
     28  1.1  joerg #include "llvm/Support/Endian.h"
     29  1.1  joerg #include "llvm/Support/EndianStream.h"
     30  1.1  joerg #include "llvm/Support/OnDiskHashTable.h"
     31  1.1  joerg #include "llvm/Support/raw_ostream.h"
     32  1.1  joerg #include <algorithm>
     33  1.1  joerg #include <cstdint>
     34  1.1  joerg #include <vector>
     35  1.1  joerg 
     36  1.1  joerg namespace clang {
     37  1.1  joerg namespace serialization {
     38  1.1  joerg 
     39  1.1  joerg /// A collection of on-disk hash tables, merged when relevant for performance.
     40  1.1  joerg template<typename Info> class MultiOnDiskHashTable {
     41  1.1  joerg public:
     42  1.1  joerg   /// A handle to a file, used when overriding tables.
     43  1.1  joerg   using file_type = typename Info::file_type;
     44  1.1  joerg 
     45  1.1  joerg   /// A pointer to an on-disk representation of the hash table.
     46  1.1  joerg   using storage_type = const unsigned char *;
     47  1.1  joerg 
     48  1.1  joerg   using external_key_type = typename Info::external_key_type;
     49  1.1  joerg   using internal_key_type = typename Info::internal_key_type;
     50  1.1  joerg   using data_type = typename Info::data_type;
     51  1.1  joerg   using data_type_builder = typename Info::data_type_builder;
     52  1.1  joerg   using hash_value_type = unsigned;
     53  1.1  joerg 
     54  1.1  joerg private:
     55  1.1  joerg   /// The generator is permitted to read our merged table.
     56  1.1  joerg   template<typename ReaderInfo, typename WriterInfo>
     57  1.1  joerg   friend class MultiOnDiskHashTableGenerator;
     58  1.1  joerg 
     59  1.1  joerg   /// A hash table stored on disk.
     60  1.1  joerg   struct OnDiskTable {
     61  1.1  joerg     using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
     62  1.1  joerg 
     63  1.1  joerg     file_type File;
     64  1.1  joerg     HashTable Table;
     65  1.1  joerg 
     66  1.1  joerg     OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
     67  1.1  joerg                 storage_type Buckets, storage_type Payload, storage_type Base,
     68  1.1  joerg                 const Info &InfoObj)
     69  1.1  joerg         : File(File),
     70  1.1  joerg           Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
     71  1.1  joerg   };
     72  1.1  joerg 
     73  1.1  joerg   struct MergedTable {
     74  1.1  joerg     std::vector<file_type> Files;
     75  1.1  joerg     llvm::DenseMap<internal_key_type, data_type> Data;
     76  1.1  joerg   };
     77  1.1  joerg 
     78  1.1  joerg   using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
     79  1.1  joerg   using TableVector = llvm::TinyPtrVector<void *>;
     80  1.1  joerg 
     81  1.1  joerg   /// The current set of on-disk and merged tables.
     82  1.1  joerg   /// We manually store the opaque value of the Table because TinyPtrVector
     83  1.1  joerg   /// can't cope with holding a PointerUnion directly.
     84  1.1  joerg   /// There can be at most one MergedTable in this vector, and if present,
     85  1.1  joerg   /// it is the first table.
     86  1.1  joerg   TableVector Tables;
     87  1.1  joerg 
     88  1.1  joerg   /// Files corresponding to overridden tables that we've not yet
     89  1.1  joerg   /// discarded.
     90  1.1  joerg   llvm::TinyPtrVector<file_type> PendingOverrides;
     91  1.1  joerg 
     92  1.1  joerg   struct AsOnDiskTable {
     93  1.1  joerg     using result_type = OnDiskTable *;
     94  1.1  joerg 
     95  1.1  joerg     result_type operator()(void *P) const {
     96  1.1  joerg       return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
     97  1.1  joerg     }
     98  1.1  joerg   };
     99  1.1  joerg 
    100  1.1  joerg   using table_iterator =
    101  1.1  joerg       llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
    102  1.1  joerg   using table_range = llvm::iterator_range<table_iterator>;
    103  1.1  joerg 
    104  1.1  joerg   /// The current set of on-disk tables.
    105  1.1  joerg   table_range tables() {
    106  1.1  joerg     auto Begin = Tables.begin(), End = Tables.end();
    107  1.1  joerg     if (getMergedTable())
    108  1.1  joerg       ++Begin;
    109  1.1  joerg     return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
    110  1.1  joerg                             llvm::map_iterator(End, AsOnDiskTable()));
    111  1.1  joerg   }
    112  1.1  joerg 
    113  1.1  joerg   MergedTable *getMergedTable() const {
    114  1.1  joerg     // If we already have a merged table, it's the first one.
    115  1.1  joerg     return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
    116  1.1  joerg                                           .template dyn_cast<MergedTable*>();
    117  1.1  joerg   }
    118  1.1  joerg 
    119  1.1  joerg   /// Delete all our current on-disk tables.
    120  1.1  joerg   void clear() {
    121  1.1  joerg     for (auto *T : tables())
    122  1.1  joerg       delete T;
    123  1.1  joerg     if (auto *M = getMergedTable())
    124  1.1  joerg       delete M;
    125  1.1  joerg     Tables.clear();
    126  1.1  joerg   }
    127  1.1  joerg 
    128  1.1  joerg   void removeOverriddenTables() {
    129  1.1  joerg     llvm::DenseSet<file_type> Files;
    130  1.1  joerg     Files.insert(PendingOverrides.begin(), PendingOverrides.end());
    131  1.1  joerg     // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
    132  1.1  joerg     auto ShouldRemove = [&Files](void *T) -> bool {
    133  1.1  joerg       auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
    134  1.1  joerg       bool Remove = Files.count(ODT->File);
    135  1.1  joerg       if (Remove)
    136  1.1  joerg         delete ODT;
    137  1.1  joerg       return Remove;
    138  1.1  joerg     };
    139  1.1  joerg     Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
    140  1.1  joerg                                 ShouldRemove),
    141  1.1  joerg                  Tables.end());
    142  1.1  joerg     PendingOverrides.clear();
    143  1.1  joerg   }
    144  1.1  joerg 
    145  1.1  joerg   void condense() {
    146  1.1  joerg     MergedTable *Merged = getMergedTable();
    147  1.1  joerg     if (!Merged)
    148  1.1  joerg       Merged = new MergedTable;
    149  1.1  joerg 
    150  1.1  joerg     // Read in all the tables and merge them together.
    151  1.1  joerg     // FIXME: Be smarter about which tables we merge.
    152  1.1  joerg     for (auto *ODT : tables()) {
    153  1.1  joerg       auto &HT = ODT->Table;
    154  1.1  joerg       Info &InfoObj = HT.getInfoObj();
    155  1.1  joerg 
    156  1.1  joerg       for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
    157  1.1  joerg         auto *LocalPtr = I.getItem();
    158  1.1  joerg 
    159  1.1  joerg         // FIXME: Don't rely on the OnDiskHashTable format here.
    160  1.1  joerg         auto L = InfoObj.ReadKeyDataLength(LocalPtr);
    161  1.1  joerg         const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
    162  1.1  joerg         data_type_builder ValueBuilder(Merged->Data[Key]);
    163  1.1  joerg         InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
    164  1.1  joerg                              ValueBuilder);
    165  1.1  joerg       }
    166  1.1  joerg 
    167  1.1  joerg       Merged->Files.push_back(ODT->File);
    168  1.1  joerg       delete ODT;
    169  1.1  joerg     }
    170  1.1  joerg 
    171  1.1  joerg     Tables.clear();
    172  1.1  joerg     Tables.push_back(Table(Merged).getOpaqueValue());
    173  1.1  joerg   }
    174  1.1  joerg 
    175  1.1  joerg public:
    176  1.1  joerg   MultiOnDiskHashTable() = default;
    177  1.1  joerg 
    178  1.1  joerg   MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
    179  1.1  joerg       : Tables(std::move(O.Tables)),
    180  1.1  joerg         PendingOverrides(std::move(O.PendingOverrides)) {
    181  1.1  joerg     O.Tables.clear();
    182  1.1  joerg   }
    183  1.1  joerg 
    184  1.1  joerg   MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
    185  1.1  joerg     if (&O == this)
    186  1.1  joerg       return *this;
    187  1.1  joerg     clear();
    188  1.1  joerg     Tables = std::move(O.Tables);
    189  1.1  joerg     O.Tables.clear();
    190  1.1  joerg     PendingOverrides = std::move(O.PendingOverrides);
    191  1.1  joerg     return *this;
    192  1.1  joerg   }
    193  1.1  joerg 
    194  1.1  joerg   ~MultiOnDiskHashTable() { clear(); }
    195  1.1  joerg 
    196  1.1  joerg   /// Add the table \p Data loaded from file \p File.
    197  1.1  joerg   void add(file_type File, storage_type Data, Info InfoObj = Info()) {
    198  1.1  joerg     using namespace llvm::support;
    199  1.1  joerg 
    200  1.1  joerg     storage_type Ptr = Data;
    201  1.1  joerg 
    202  1.1  joerg     uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
    203  1.1  joerg 
    204  1.1  joerg     // Read the list of overridden files.
    205  1.1  joerg     uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
    206  1.1  joerg     // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
    207  1.1  joerg     // an additional copy.
    208  1.1  joerg     llvm::SmallVector<file_type, 16> OverriddenFiles;
    209  1.1  joerg     OverriddenFiles.reserve(NumFiles);
    210  1.1  joerg     for (/**/; NumFiles != 0; --NumFiles)
    211  1.1  joerg       OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
    212  1.1  joerg     PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
    213  1.1  joerg                             OverriddenFiles.end());
    214  1.1  joerg 
    215  1.1  joerg     // Read the OnDiskChainedHashTable header.
    216  1.1  joerg     storage_type Buckets = Data + BucketOffset;
    217  1.1  joerg     auto NumBucketsAndEntries =
    218  1.1  joerg         OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
    219  1.1  joerg 
    220  1.1  joerg     // Register the table.
    221  1.1  joerg     Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
    222  1.1  joerg                                      NumBucketsAndEntries.second,
    223  1.1  joerg                                      Buckets, Ptr, Data, std::move(InfoObj));
    224  1.1  joerg     Tables.push_back(NewTable.getOpaqueValue());
    225  1.1  joerg   }
    226  1.1  joerg 
    227  1.1  joerg   /// Find and read the lookup results for \p EKey.
    228  1.1  joerg   data_type find(const external_key_type &EKey) {
    229  1.1  joerg     data_type Result;
    230  1.1  joerg 
    231  1.1  joerg     if (!PendingOverrides.empty())
    232  1.1  joerg       removeOverriddenTables();
    233  1.1  joerg 
    234  1.1  joerg     if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
    235  1.1  joerg       condense();
    236  1.1  joerg 
    237  1.1  joerg     internal_key_type Key = Info::GetInternalKey(EKey);
    238  1.1  joerg     auto KeyHash = Info::ComputeHash(Key);
    239  1.1  joerg 
    240  1.1  joerg     if (MergedTable *M = getMergedTable()) {
    241  1.1  joerg       auto It = M->Data.find(Key);
    242  1.1  joerg       if (It != M->Data.end())
    243  1.1  joerg         Result = It->second;
    244  1.1  joerg     }
    245  1.1  joerg 
    246  1.1  joerg     data_type_builder ResultBuilder(Result);
    247  1.1  joerg 
    248  1.1  joerg     for (auto *ODT : tables()) {
    249  1.1  joerg       auto &HT = ODT->Table;
    250  1.1  joerg       auto It = HT.find_hashed(Key, KeyHash);
    251  1.1  joerg       if (It != HT.end())
    252  1.1  joerg         HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
    253  1.1  joerg                                      ResultBuilder);
    254  1.1  joerg     }
    255  1.1  joerg 
    256  1.1  joerg     return Result;
    257  1.1  joerg   }
    258  1.1  joerg 
    259  1.1  joerg   /// Read all the lookup results into a single value. This only makes
    260  1.1  joerg   /// sense if merging values across keys is meaningful.
    261  1.1  joerg   data_type findAll() {
    262  1.1  joerg     data_type Result;
    263  1.1  joerg     data_type_builder ResultBuilder(Result);
    264  1.1  joerg 
    265  1.1  joerg     if (!PendingOverrides.empty())
    266  1.1  joerg       removeOverriddenTables();
    267  1.1  joerg 
    268  1.1  joerg     if (MergedTable *M = getMergedTable()) {
    269  1.1  joerg       for (auto &KV : M->Data)
    270  1.1  joerg         Info::MergeDataInto(KV.second, ResultBuilder);
    271  1.1  joerg     }
    272  1.1  joerg 
    273  1.1  joerg     for (auto *ODT : tables()) {
    274  1.1  joerg       auto &HT = ODT->Table;
    275  1.1  joerg       Info &InfoObj = HT.getInfoObj();
    276  1.1  joerg       for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
    277  1.1  joerg         auto *LocalPtr = I.getItem();
    278  1.1  joerg 
    279  1.1  joerg         // FIXME: Don't rely on the OnDiskHashTable format here.
    280  1.1  joerg         auto L = InfoObj.ReadKeyDataLength(LocalPtr);
    281  1.1  joerg         const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
    282  1.1  joerg         InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
    283  1.1  joerg       }
    284  1.1  joerg     }
    285  1.1  joerg 
    286  1.1  joerg     return Result;
    287  1.1  joerg   }
    288  1.1  joerg };
    289  1.1  joerg 
    290  1.1  joerg /// Writer for the on-disk hash table.
    291  1.1  joerg template<typename ReaderInfo, typename WriterInfo>
    292  1.1  joerg class MultiOnDiskHashTableGenerator {
    293  1.1  joerg   using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
    294  1.1  joerg   using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
    295  1.1  joerg 
    296  1.1  joerg   Generator Gen;
    297  1.1  joerg 
    298  1.1  joerg public:
    299  1.1  joerg   MultiOnDiskHashTableGenerator() : Gen() {}
    300  1.1  joerg 
    301  1.1  joerg   void insert(typename WriterInfo::key_type_ref Key,
    302  1.1  joerg               typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
    303  1.1  joerg     Gen.insert(Key, Data, Info);
    304  1.1  joerg   }
    305  1.1  joerg 
    306  1.1  joerg   void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
    307  1.1  joerg             const BaseTable *Base) {
    308  1.1  joerg     using namespace llvm::support;
    309  1.1  joerg 
    310  1.1  joerg     llvm::raw_svector_ostream OutStream(Out);
    311  1.1  joerg 
    312  1.1  joerg     // Write our header information.
    313  1.1  joerg     {
    314  1.1  joerg       endian::Writer Writer(OutStream, little);
    315  1.1  joerg 
    316  1.1  joerg       // Reserve four bytes for the bucket offset.
    317  1.1  joerg       Writer.write<uint32_t>(0);
    318  1.1  joerg 
    319  1.1  joerg       if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
    320  1.1  joerg         // Write list of overridden files.
    321  1.1  joerg         Writer.write<uint32_t>(Merged->Files.size());
    322  1.1  joerg         for (const auto &F : Merged->Files)
    323  1.1  joerg           Info.EmitFileRef(OutStream, F);
    324  1.1  joerg 
    325  1.1  joerg         // Add all merged entries from Base to the generator.
    326  1.1  joerg         for (auto &KV : Merged->Data) {
    327  1.1  joerg           if (!Gen.contains(KV.first, Info))
    328  1.1  joerg             Gen.insert(KV.first, Info.ImportData(KV.second), Info);
    329  1.1  joerg         }
    330  1.1  joerg       } else {
    331  1.1  joerg         Writer.write<uint32_t>(0);
    332  1.1  joerg       }
    333  1.1  joerg     }
    334  1.1  joerg 
    335  1.1  joerg     // Write the table itself.
    336  1.1  joerg     uint32_t BucketOffset = Gen.Emit(OutStream, Info);
    337  1.1  joerg 
    338  1.1  joerg     // Replace the first four bytes with the bucket offset.
    339  1.1  joerg     endian::write32le(Out.data(), BucketOffset);
    340  1.1  joerg   }
    341  1.1  joerg };
    342  1.1  joerg 
    343  1.1  joerg } // namespace serialization
    344  1.1  joerg } // namespace clang
    345  1.1  joerg 
    346  1.1  joerg #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
    347