Home | History | Annotate | Line # | Download | only in Serialization
MultiOnDiskHashTable.h revision 1.1
      1 //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 //  This file provides a hash table data structure suitable for incremental and
     10 //  distributed storage across a set of files.
     11 //
     12 //  Multiple hash tables from different files are implicitly merged to improve
     13 //  performance, and on reload the merged table will override those from other
     14 //  files.
     15 //
     16 //===----------------------------------------------------------------------===//
     17 
     18 #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
     19 #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
     20 
     21 #include "llvm/ADT/DenseMap.h"
     22 #include "llvm/ADT/DenseSet.h"
     23 #include "llvm/ADT/PointerUnion.h"
     24 #include "llvm/ADT/STLExtras.h"
     25 #include "llvm/ADT/SmallVector.h"
     26 #include "llvm/ADT/TinyPtrVector.h"
     27 #include "llvm/ADT/iterator_range.h"
     28 #include "llvm/Support/Endian.h"
     29 #include "llvm/Support/EndianStream.h"
     30 #include "llvm/Support/OnDiskHashTable.h"
     31 #include "llvm/Support/raw_ostream.h"
     32 #include <algorithm>
     33 #include <cstdint>
     34 #include <vector>
     35 
     36 namespace clang {
     37 namespace serialization {
     38 
     39 /// A collection of on-disk hash tables, merged when relevant for performance.
     40 template<typename Info> class MultiOnDiskHashTable {
     41 public:
     42   /// A handle to a file, used when overriding tables.
     43   using file_type = typename Info::file_type;
     44 
     45   /// A pointer to an on-disk representation of the hash table.
     46   using storage_type = const unsigned char *;
     47 
     48   using external_key_type = typename Info::external_key_type;
     49   using internal_key_type = typename Info::internal_key_type;
     50   using data_type = typename Info::data_type;
     51   using data_type_builder = typename Info::data_type_builder;
     52   using hash_value_type = unsigned;
     53 
     54 private:
     55   /// The generator is permitted to read our merged table.
     56   template<typename ReaderInfo, typename WriterInfo>
     57   friend class MultiOnDiskHashTableGenerator;
     58 
     59   /// A hash table stored on disk.
     60   struct OnDiskTable {
     61     using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
     62 
     63     file_type File;
     64     HashTable Table;
     65 
     66     OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
     67                 storage_type Buckets, storage_type Payload, storage_type Base,
     68                 const Info &InfoObj)
     69         : File(File),
     70           Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
     71   };
     72 
     73   struct MergedTable {
     74     std::vector<file_type> Files;
     75     llvm::DenseMap<internal_key_type, data_type> Data;
     76   };
     77 
     78   using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
     79   using TableVector = llvm::TinyPtrVector<void *>;
     80 
     81   /// The current set of on-disk and merged tables.
     82   /// We manually store the opaque value of the Table because TinyPtrVector
     83   /// can't cope with holding a PointerUnion directly.
     84   /// There can be at most one MergedTable in this vector, and if present,
     85   /// it is the first table.
     86   TableVector Tables;
     87 
     88   /// Files corresponding to overridden tables that we've not yet
     89   /// discarded.
     90   llvm::TinyPtrVector<file_type> PendingOverrides;
     91 
     92   struct AsOnDiskTable {
     93     using result_type = OnDiskTable *;
     94 
     95     result_type operator()(void *P) const {
     96       return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
     97     }
     98   };
     99 
    100   using table_iterator =
    101       llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
    102   using table_range = llvm::iterator_range<table_iterator>;
    103 
    104   /// The current set of on-disk tables.
    105   table_range tables() {
    106     auto Begin = Tables.begin(), End = Tables.end();
    107     if (getMergedTable())
    108       ++Begin;
    109     return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
    110                             llvm::map_iterator(End, AsOnDiskTable()));
    111   }
    112 
    113   MergedTable *getMergedTable() const {
    114     // If we already have a merged table, it's the first one.
    115     return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
    116                                           .template dyn_cast<MergedTable*>();
    117   }
    118 
    119   /// Delete all our current on-disk tables.
    120   void clear() {
    121     for (auto *T : tables())
    122       delete T;
    123     if (auto *M = getMergedTable())
    124       delete M;
    125     Tables.clear();
    126   }
    127 
    128   void removeOverriddenTables() {
    129     llvm::DenseSet<file_type> Files;
    130     Files.insert(PendingOverrides.begin(), PendingOverrides.end());
    131     // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
    132     auto ShouldRemove = [&Files](void *T) -> bool {
    133       auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
    134       bool Remove = Files.count(ODT->File);
    135       if (Remove)
    136         delete ODT;
    137       return Remove;
    138     };
    139     Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
    140                                 ShouldRemove),
    141                  Tables.end());
    142     PendingOverrides.clear();
    143   }
    144 
    145   void condense() {
    146     MergedTable *Merged = getMergedTable();
    147     if (!Merged)
    148       Merged = new MergedTable;
    149 
    150     // Read in all the tables and merge them together.
    151     // FIXME: Be smarter about which tables we merge.
    152     for (auto *ODT : tables()) {
    153       auto &HT = ODT->Table;
    154       Info &InfoObj = HT.getInfoObj();
    155 
    156       for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
    157         auto *LocalPtr = I.getItem();
    158 
    159         // FIXME: Don't rely on the OnDiskHashTable format here.
    160         auto L = InfoObj.ReadKeyDataLength(LocalPtr);
    161         const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
    162         data_type_builder ValueBuilder(Merged->Data[Key]);
    163         InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
    164                              ValueBuilder);
    165       }
    166 
    167       Merged->Files.push_back(ODT->File);
    168       delete ODT;
    169     }
    170 
    171     Tables.clear();
    172     Tables.push_back(Table(Merged).getOpaqueValue());
    173   }
    174 
    175 public:
    176   MultiOnDiskHashTable() = default;
    177 
    178   MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
    179       : Tables(std::move(O.Tables)),
    180         PendingOverrides(std::move(O.PendingOverrides)) {
    181     O.Tables.clear();
    182   }
    183 
    184   MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
    185     if (&O == this)
    186       return *this;
    187     clear();
    188     Tables = std::move(O.Tables);
    189     O.Tables.clear();
    190     PendingOverrides = std::move(O.PendingOverrides);
    191     return *this;
    192   }
    193 
    194   ~MultiOnDiskHashTable() { clear(); }
    195 
    196   /// Add the table \p Data loaded from file \p File.
    197   void add(file_type File, storage_type Data, Info InfoObj = Info()) {
    198     using namespace llvm::support;
    199 
    200     storage_type Ptr = Data;
    201 
    202     uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
    203 
    204     // Read the list of overridden files.
    205     uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
    206     // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
    207     // an additional copy.
    208     llvm::SmallVector<file_type, 16> OverriddenFiles;
    209     OverriddenFiles.reserve(NumFiles);
    210     for (/**/; NumFiles != 0; --NumFiles)
    211       OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
    212     PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
    213                             OverriddenFiles.end());
    214 
    215     // Read the OnDiskChainedHashTable header.
    216     storage_type Buckets = Data + BucketOffset;
    217     auto NumBucketsAndEntries =
    218         OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
    219 
    220     // Register the table.
    221     Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
    222                                      NumBucketsAndEntries.second,
    223                                      Buckets, Ptr, Data, std::move(InfoObj));
    224     Tables.push_back(NewTable.getOpaqueValue());
    225   }
    226 
    227   /// Find and read the lookup results for \p EKey.
    228   data_type find(const external_key_type &EKey) {
    229     data_type Result;
    230 
    231     if (!PendingOverrides.empty())
    232       removeOverriddenTables();
    233 
    234     if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
    235       condense();
    236 
    237     internal_key_type Key = Info::GetInternalKey(EKey);
    238     auto KeyHash = Info::ComputeHash(Key);
    239 
    240     if (MergedTable *M = getMergedTable()) {
    241       auto It = M->Data.find(Key);
    242       if (It != M->Data.end())
    243         Result = It->second;
    244     }
    245 
    246     data_type_builder ResultBuilder(Result);
    247 
    248     for (auto *ODT : tables()) {
    249       auto &HT = ODT->Table;
    250       auto It = HT.find_hashed(Key, KeyHash);
    251       if (It != HT.end())
    252         HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
    253                                      ResultBuilder);
    254     }
    255 
    256     return Result;
    257   }
    258 
    259   /// Read all the lookup results into a single value. This only makes
    260   /// sense if merging values across keys is meaningful.
    261   data_type findAll() {
    262     data_type Result;
    263     data_type_builder ResultBuilder(Result);
    264 
    265     if (!PendingOverrides.empty())
    266       removeOverriddenTables();
    267 
    268     if (MergedTable *M = getMergedTable()) {
    269       for (auto &KV : M->Data)
    270         Info::MergeDataInto(KV.second, ResultBuilder);
    271     }
    272 
    273     for (auto *ODT : tables()) {
    274       auto &HT = ODT->Table;
    275       Info &InfoObj = HT.getInfoObj();
    276       for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
    277         auto *LocalPtr = I.getItem();
    278 
    279         // FIXME: Don't rely on the OnDiskHashTable format here.
    280         auto L = InfoObj.ReadKeyDataLength(LocalPtr);
    281         const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
    282         InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
    283       }
    284     }
    285 
    286     return Result;
    287   }
    288 };
    289 
    290 /// Writer for the on-disk hash table.
    291 template<typename ReaderInfo, typename WriterInfo>
    292 class MultiOnDiskHashTableGenerator {
    293   using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
    294   using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
    295 
    296   Generator Gen;
    297 
    298 public:
    299   MultiOnDiskHashTableGenerator() : Gen() {}
    300 
    301   void insert(typename WriterInfo::key_type_ref Key,
    302               typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
    303     Gen.insert(Key, Data, Info);
    304   }
    305 
    306   void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
    307             const BaseTable *Base) {
    308     using namespace llvm::support;
    309 
    310     llvm::raw_svector_ostream OutStream(Out);
    311 
    312     // Write our header information.
    313     {
    314       endian::Writer Writer(OutStream, little);
    315 
    316       // Reserve four bytes for the bucket offset.
    317       Writer.write<uint32_t>(0);
    318 
    319       if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
    320         // Write list of overridden files.
    321         Writer.write<uint32_t>(Merged->Files.size());
    322         for (const auto &F : Merged->Files)
    323           Info.EmitFileRef(OutStream, F);
    324 
    325         // Add all merged entries from Base to the generator.
    326         for (auto &KV : Merged->Data) {
    327           if (!Gen.contains(KV.first, Info))
    328             Gen.insert(KV.first, Info.ImportData(KV.second), Info);
    329         }
    330       } else {
    331         Writer.write<uint32_t>(0);
    332       }
    333     }
    334 
    335     // Write the table itself.
    336     uint32_t BucketOffset = Gen.Emit(OutStream, Info);
    337 
    338     // Replace the first four bytes with the bucket offset.
    339     endian::write32le(Out.data(), BucketOffset);
    340   }
    341 };
    342 
    343 } // namespace serialization
    344 } // namespace clang
    345 
    346 #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
    347