1 1.1 joerg //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===// 2 1.1 joerg // 3 1.1 joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 1.1 joerg // See https://llvm.org/LICENSE.txt for license information. 5 1.1 joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 1.1 joerg // 7 1.1 joerg //===----------------------------------------------------------------------===// 8 1.1 joerg // 9 1.1 joerg // This file provides a hash table data structure suitable for incremental and 10 1.1 joerg // distributed storage across a set of files. 11 1.1 joerg // 12 1.1 joerg // Multiple hash tables from different files are implicitly merged to improve 13 1.1 joerg // performance, and on reload the merged table will override those from other 14 1.1 joerg // files. 15 1.1 joerg // 16 1.1 joerg //===----------------------------------------------------------------------===// 17 1.1 joerg 18 1.1 joerg #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 19 1.1 joerg #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 20 1.1 joerg 21 1.1 joerg #include "llvm/ADT/DenseMap.h" 22 1.1 joerg #include "llvm/ADT/DenseSet.h" 23 1.1 joerg #include "llvm/ADT/PointerUnion.h" 24 1.1 joerg #include "llvm/ADT/STLExtras.h" 25 1.1 joerg #include "llvm/ADT/SmallVector.h" 26 1.1 joerg #include "llvm/ADT/TinyPtrVector.h" 27 1.1 joerg #include "llvm/ADT/iterator_range.h" 28 1.1 joerg #include "llvm/Support/Endian.h" 29 1.1 joerg #include "llvm/Support/EndianStream.h" 30 1.1 joerg #include "llvm/Support/OnDiskHashTable.h" 31 1.1 joerg #include "llvm/Support/raw_ostream.h" 32 1.1 joerg #include <algorithm> 33 1.1 joerg #include <cstdint> 34 1.1 joerg #include <vector> 35 1.1 joerg 36 1.1 joerg namespace clang { 37 1.1 joerg namespace serialization { 38 1.1 joerg 39 1.1 joerg /// A collection of on-disk hash tables, merged when relevant for performance. 40 1.1 joerg template<typename Info> class MultiOnDiskHashTable { 41 1.1 joerg public: 42 1.1 joerg /// A handle to a file, used when overriding tables. 43 1.1 joerg using file_type = typename Info::file_type; 44 1.1 joerg 45 1.1 joerg /// A pointer to an on-disk representation of the hash table. 46 1.1 joerg using storage_type = const unsigned char *; 47 1.1 joerg 48 1.1 joerg using external_key_type = typename Info::external_key_type; 49 1.1 joerg using internal_key_type = typename Info::internal_key_type; 50 1.1 joerg using data_type = typename Info::data_type; 51 1.1 joerg using data_type_builder = typename Info::data_type_builder; 52 1.1 joerg using hash_value_type = unsigned; 53 1.1 joerg 54 1.1 joerg private: 55 1.1 joerg /// The generator is permitted to read our merged table. 56 1.1 joerg template<typename ReaderInfo, typename WriterInfo> 57 1.1 joerg friend class MultiOnDiskHashTableGenerator; 58 1.1 joerg 59 1.1 joerg /// A hash table stored on disk. 60 1.1 joerg struct OnDiskTable { 61 1.1 joerg using HashTable = llvm::OnDiskIterableChainedHashTable<Info>; 62 1.1 joerg 63 1.1 joerg file_type File; 64 1.1 joerg HashTable Table; 65 1.1 joerg 66 1.1 joerg OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries, 67 1.1 joerg storage_type Buckets, storage_type Payload, storage_type Base, 68 1.1 joerg const Info &InfoObj) 69 1.1 joerg : File(File), 70 1.1 joerg Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {} 71 1.1 joerg }; 72 1.1 joerg 73 1.1 joerg struct MergedTable { 74 1.1 joerg std::vector<file_type> Files; 75 1.1 joerg llvm::DenseMap<internal_key_type, data_type> Data; 76 1.1 joerg }; 77 1.1 joerg 78 1.1 joerg using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>; 79 1.1 joerg using TableVector = llvm::TinyPtrVector<void *>; 80 1.1 joerg 81 1.1 joerg /// The current set of on-disk and merged tables. 82 1.1 joerg /// We manually store the opaque value of the Table because TinyPtrVector 83 1.1 joerg /// can't cope with holding a PointerUnion directly. 84 1.1 joerg /// There can be at most one MergedTable in this vector, and if present, 85 1.1 joerg /// it is the first table. 86 1.1 joerg TableVector Tables; 87 1.1 joerg 88 1.1 joerg /// Files corresponding to overridden tables that we've not yet 89 1.1 joerg /// discarded. 90 1.1 joerg llvm::TinyPtrVector<file_type> PendingOverrides; 91 1.1 joerg 92 1.1 joerg struct AsOnDiskTable { 93 1.1 joerg using result_type = OnDiskTable *; 94 1.1 joerg 95 1.1 joerg result_type operator()(void *P) const { 96 1.1 joerg return Table::getFromOpaqueValue(P).template get<OnDiskTable *>(); 97 1.1 joerg } 98 1.1 joerg }; 99 1.1 joerg 100 1.1 joerg using table_iterator = 101 1.1 joerg llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>; 102 1.1 joerg using table_range = llvm::iterator_range<table_iterator>; 103 1.1 joerg 104 1.1 joerg /// The current set of on-disk tables. 105 1.1 joerg table_range tables() { 106 1.1 joerg auto Begin = Tables.begin(), End = Tables.end(); 107 1.1 joerg if (getMergedTable()) 108 1.1 joerg ++Begin; 109 1.1 joerg return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()), 110 1.1 joerg llvm::map_iterator(End, AsOnDiskTable())); 111 1.1 joerg } 112 1.1 joerg 113 1.1 joerg MergedTable *getMergedTable() const { 114 1.1 joerg // If we already have a merged table, it's the first one. 115 1.1 joerg return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin()) 116 1.1 joerg .template dyn_cast<MergedTable*>(); 117 1.1 joerg } 118 1.1 joerg 119 1.1 joerg /// Delete all our current on-disk tables. 120 1.1 joerg void clear() { 121 1.1 joerg for (auto *T : tables()) 122 1.1 joerg delete T; 123 1.1 joerg if (auto *M = getMergedTable()) 124 1.1 joerg delete M; 125 1.1 joerg Tables.clear(); 126 1.1 joerg } 127 1.1 joerg 128 1.1 joerg void removeOverriddenTables() { 129 1.1 joerg llvm::DenseSet<file_type> Files; 130 1.1 joerg Files.insert(PendingOverrides.begin(), PendingOverrides.end()); 131 1.1 joerg // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug. 132 1.1 joerg auto ShouldRemove = [&Files](void *T) -> bool { 133 1.1 joerg auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>(); 134 1.1 joerg bool Remove = Files.count(ODT->File); 135 1.1 joerg if (Remove) 136 1.1 joerg delete ODT; 137 1.1 joerg return Remove; 138 1.1 joerg }; 139 1.1 joerg Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(), 140 1.1 joerg ShouldRemove), 141 1.1 joerg Tables.end()); 142 1.1 joerg PendingOverrides.clear(); 143 1.1 joerg } 144 1.1 joerg 145 1.1 joerg void condense() { 146 1.1 joerg MergedTable *Merged = getMergedTable(); 147 1.1 joerg if (!Merged) 148 1.1 joerg Merged = new MergedTable; 149 1.1 joerg 150 1.1 joerg // Read in all the tables and merge them together. 151 1.1 joerg // FIXME: Be smarter about which tables we merge. 152 1.1 joerg for (auto *ODT : tables()) { 153 1.1 joerg auto &HT = ODT->Table; 154 1.1 joerg Info &InfoObj = HT.getInfoObj(); 155 1.1 joerg 156 1.1 joerg for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 157 1.1 joerg auto *LocalPtr = I.getItem(); 158 1.1 joerg 159 1.1 joerg // FIXME: Don't rely on the OnDiskHashTable format here. 160 1.1 joerg auto L = InfoObj.ReadKeyDataLength(LocalPtr); 161 1.1 joerg const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 162 1.1 joerg data_type_builder ValueBuilder(Merged->Data[Key]); 163 1.1 joerg InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, 164 1.1 joerg ValueBuilder); 165 1.1 joerg } 166 1.1 joerg 167 1.1 joerg Merged->Files.push_back(ODT->File); 168 1.1 joerg delete ODT; 169 1.1 joerg } 170 1.1 joerg 171 1.1 joerg Tables.clear(); 172 1.1 joerg Tables.push_back(Table(Merged).getOpaqueValue()); 173 1.1 joerg } 174 1.1 joerg 175 1.1 joerg public: 176 1.1 joerg MultiOnDiskHashTable() = default; 177 1.1 joerg 178 1.1 joerg MultiOnDiskHashTable(MultiOnDiskHashTable &&O) 179 1.1 joerg : Tables(std::move(O.Tables)), 180 1.1 joerg PendingOverrides(std::move(O.PendingOverrides)) { 181 1.1 joerg O.Tables.clear(); 182 1.1 joerg } 183 1.1 joerg 184 1.1 joerg MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) { 185 1.1 joerg if (&O == this) 186 1.1 joerg return *this; 187 1.1 joerg clear(); 188 1.1 joerg Tables = std::move(O.Tables); 189 1.1 joerg O.Tables.clear(); 190 1.1 joerg PendingOverrides = std::move(O.PendingOverrides); 191 1.1 joerg return *this; 192 1.1 joerg } 193 1.1 joerg 194 1.1 joerg ~MultiOnDiskHashTable() { clear(); } 195 1.1 joerg 196 1.1 joerg /// Add the table \p Data loaded from file \p File. 197 1.1 joerg void add(file_type File, storage_type Data, Info InfoObj = Info()) { 198 1.1 joerg using namespace llvm::support; 199 1.1 joerg 200 1.1 joerg storage_type Ptr = Data; 201 1.1 joerg 202 1.1 joerg uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr); 203 1.1 joerg 204 1.1 joerg // Read the list of overridden files. 205 1.1 joerg uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr); 206 1.1 joerg // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make 207 1.1 joerg // an additional copy. 208 1.1 joerg llvm::SmallVector<file_type, 16> OverriddenFiles; 209 1.1 joerg OverriddenFiles.reserve(NumFiles); 210 1.1 joerg for (/**/; NumFiles != 0; --NumFiles) 211 1.1 joerg OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr)); 212 1.1 joerg PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(), 213 1.1 joerg OverriddenFiles.end()); 214 1.1 joerg 215 1.1 joerg // Read the OnDiskChainedHashTable header. 216 1.1 joerg storage_type Buckets = Data + BucketOffset; 217 1.1 joerg auto NumBucketsAndEntries = 218 1.1 joerg OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets); 219 1.1 joerg 220 1.1 joerg // Register the table. 221 1.1 joerg Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first, 222 1.1 joerg NumBucketsAndEntries.second, 223 1.1 joerg Buckets, Ptr, Data, std::move(InfoObj)); 224 1.1 joerg Tables.push_back(NewTable.getOpaqueValue()); 225 1.1 joerg } 226 1.1 joerg 227 1.1 joerg /// Find and read the lookup results for \p EKey. 228 1.1 joerg data_type find(const external_key_type &EKey) { 229 1.1 joerg data_type Result; 230 1.1 joerg 231 1.1 joerg if (!PendingOverrides.empty()) 232 1.1 joerg removeOverriddenTables(); 233 1.1 joerg 234 1.1 joerg if (Tables.size() > static_cast<unsigned>(Info::MaxTables)) 235 1.1 joerg condense(); 236 1.1 joerg 237 1.1 joerg internal_key_type Key = Info::GetInternalKey(EKey); 238 1.1 joerg auto KeyHash = Info::ComputeHash(Key); 239 1.1 joerg 240 1.1 joerg if (MergedTable *M = getMergedTable()) { 241 1.1 joerg auto It = M->Data.find(Key); 242 1.1 joerg if (It != M->Data.end()) 243 1.1 joerg Result = It->second; 244 1.1 joerg } 245 1.1 joerg 246 1.1 joerg data_type_builder ResultBuilder(Result); 247 1.1 joerg 248 1.1 joerg for (auto *ODT : tables()) { 249 1.1 joerg auto &HT = ODT->Table; 250 1.1 joerg auto It = HT.find_hashed(Key, KeyHash); 251 1.1 joerg if (It != HT.end()) 252 1.1 joerg HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(), 253 1.1 joerg ResultBuilder); 254 1.1 joerg } 255 1.1 joerg 256 1.1 joerg return Result; 257 1.1 joerg } 258 1.1 joerg 259 1.1 joerg /// Read all the lookup results into a single value. This only makes 260 1.1 joerg /// sense if merging values across keys is meaningful. 261 1.1 joerg data_type findAll() { 262 1.1 joerg data_type Result; 263 1.1 joerg data_type_builder ResultBuilder(Result); 264 1.1 joerg 265 1.1 joerg if (!PendingOverrides.empty()) 266 1.1 joerg removeOverriddenTables(); 267 1.1 joerg 268 1.1 joerg if (MergedTable *M = getMergedTable()) { 269 1.1 joerg for (auto &KV : M->Data) 270 1.1 joerg Info::MergeDataInto(KV.second, ResultBuilder); 271 1.1 joerg } 272 1.1 joerg 273 1.1 joerg for (auto *ODT : tables()) { 274 1.1 joerg auto &HT = ODT->Table; 275 1.1 joerg Info &InfoObj = HT.getInfoObj(); 276 1.1 joerg for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) { 277 1.1 joerg auto *LocalPtr = I.getItem(); 278 1.1 joerg 279 1.1 joerg // FIXME: Don't rely on the OnDiskHashTable format here. 280 1.1 joerg auto L = InfoObj.ReadKeyDataLength(LocalPtr); 281 1.1 joerg const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first); 282 1.1 joerg InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder); 283 1.1 joerg } 284 1.1 joerg } 285 1.1 joerg 286 1.1 joerg return Result; 287 1.1 joerg } 288 1.1 joerg }; 289 1.1 joerg 290 1.1 joerg /// Writer for the on-disk hash table. 291 1.1 joerg template<typename ReaderInfo, typename WriterInfo> 292 1.1 joerg class MultiOnDiskHashTableGenerator { 293 1.1 joerg using BaseTable = MultiOnDiskHashTable<ReaderInfo>; 294 1.1 joerg using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>; 295 1.1 joerg 296 1.1 joerg Generator Gen; 297 1.1 joerg 298 1.1 joerg public: 299 1.1 joerg MultiOnDiskHashTableGenerator() : Gen() {} 300 1.1 joerg 301 1.1 joerg void insert(typename WriterInfo::key_type_ref Key, 302 1.1 joerg typename WriterInfo::data_type_ref Data, WriterInfo &Info) { 303 1.1 joerg Gen.insert(Key, Data, Info); 304 1.1 joerg } 305 1.1 joerg 306 1.1 joerg void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info, 307 1.1 joerg const BaseTable *Base) { 308 1.1 joerg using namespace llvm::support; 309 1.1 joerg 310 1.1 joerg llvm::raw_svector_ostream OutStream(Out); 311 1.1 joerg 312 1.1 joerg // Write our header information. 313 1.1 joerg { 314 1.1 joerg endian::Writer Writer(OutStream, little); 315 1.1 joerg 316 1.1 joerg // Reserve four bytes for the bucket offset. 317 1.1 joerg Writer.write<uint32_t>(0); 318 1.1 joerg 319 1.1 joerg if (auto *Merged = Base ? Base->getMergedTable() : nullptr) { 320 1.1 joerg // Write list of overridden files. 321 1.1 joerg Writer.write<uint32_t>(Merged->Files.size()); 322 1.1 joerg for (const auto &F : Merged->Files) 323 1.1 joerg Info.EmitFileRef(OutStream, F); 324 1.1 joerg 325 1.1 joerg // Add all merged entries from Base to the generator. 326 1.1 joerg for (auto &KV : Merged->Data) { 327 1.1 joerg if (!Gen.contains(KV.first, Info)) 328 1.1 joerg Gen.insert(KV.first, Info.ImportData(KV.second), Info); 329 1.1 joerg } 330 1.1 joerg } else { 331 1.1 joerg Writer.write<uint32_t>(0); 332 1.1 joerg } 333 1.1 joerg } 334 1.1 joerg 335 1.1 joerg // Write the table itself. 336 1.1 joerg uint32_t BucketOffset = Gen.Emit(OutStream, Info); 337 1.1 joerg 338 1.1 joerg // Replace the first four bytes with the bucket offset. 339 1.1 joerg endian::write32le(Out.data(), BucketOffset); 340 1.1 joerg } 341 1.1 joerg }; 342 1.1 joerg 343 1.1 joerg } // namespace serialization 344 1.1 joerg } // namespace clang 345 1.1 joerg 346 1.1 joerg #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H 347