MultiOnDiskHashTable.h revision 1.1 1 1.1 joerg //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
2 1.1 joerg //
3 1.1 joerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 1.1 joerg // See https://llvm.org/LICENSE.txt for license information.
5 1.1 joerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 1.1 joerg //
7 1.1 joerg //===----------------------------------------------------------------------===//
8 1.1 joerg //
9 1.1 joerg // This file provides a hash table data structure suitable for incremental and
10 1.1 joerg // distributed storage across a set of files.
11 1.1 joerg //
12 1.1 joerg // Multiple hash tables from different files are implicitly merged to improve
13 1.1 joerg // performance, and on reload the merged table will override those from other
14 1.1 joerg // files.
15 1.1 joerg //
16 1.1 joerg //===----------------------------------------------------------------------===//
17 1.1 joerg
18 1.1 joerg #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
19 1.1 joerg #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
20 1.1 joerg
21 1.1 joerg #include "llvm/ADT/DenseMap.h"
22 1.1 joerg #include "llvm/ADT/DenseSet.h"
23 1.1 joerg #include "llvm/ADT/PointerUnion.h"
24 1.1 joerg #include "llvm/ADT/STLExtras.h"
25 1.1 joerg #include "llvm/ADT/SmallVector.h"
26 1.1 joerg #include "llvm/ADT/TinyPtrVector.h"
27 1.1 joerg #include "llvm/ADT/iterator_range.h"
28 1.1 joerg #include "llvm/Support/Endian.h"
29 1.1 joerg #include "llvm/Support/EndianStream.h"
30 1.1 joerg #include "llvm/Support/OnDiskHashTable.h"
31 1.1 joerg #include "llvm/Support/raw_ostream.h"
32 1.1 joerg #include <algorithm>
33 1.1 joerg #include <cstdint>
34 1.1 joerg #include <vector>
35 1.1 joerg
36 1.1 joerg namespace clang {
37 1.1 joerg namespace serialization {
38 1.1 joerg
39 1.1 joerg /// A collection of on-disk hash tables, merged when relevant for performance.
40 1.1 joerg template<typename Info> class MultiOnDiskHashTable {
41 1.1 joerg public:
42 1.1 joerg /// A handle to a file, used when overriding tables.
43 1.1 joerg using file_type = typename Info::file_type;
44 1.1 joerg
45 1.1 joerg /// A pointer to an on-disk representation of the hash table.
46 1.1 joerg using storage_type = const unsigned char *;
47 1.1 joerg
48 1.1 joerg using external_key_type = typename Info::external_key_type;
49 1.1 joerg using internal_key_type = typename Info::internal_key_type;
50 1.1 joerg using data_type = typename Info::data_type;
51 1.1 joerg using data_type_builder = typename Info::data_type_builder;
52 1.1 joerg using hash_value_type = unsigned;
53 1.1 joerg
54 1.1 joerg private:
55 1.1 joerg /// The generator is permitted to read our merged table.
56 1.1 joerg template<typename ReaderInfo, typename WriterInfo>
57 1.1 joerg friend class MultiOnDiskHashTableGenerator;
58 1.1 joerg
59 1.1 joerg /// A hash table stored on disk.
60 1.1 joerg struct OnDiskTable {
61 1.1 joerg using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
62 1.1 joerg
63 1.1 joerg file_type File;
64 1.1 joerg HashTable Table;
65 1.1 joerg
66 1.1 joerg OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
67 1.1 joerg storage_type Buckets, storage_type Payload, storage_type Base,
68 1.1 joerg const Info &InfoObj)
69 1.1 joerg : File(File),
70 1.1 joerg Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
71 1.1 joerg };
72 1.1 joerg
73 1.1 joerg struct MergedTable {
74 1.1 joerg std::vector<file_type> Files;
75 1.1 joerg llvm::DenseMap<internal_key_type, data_type> Data;
76 1.1 joerg };
77 1.1 joerg
78 1.1 joerg using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
79 1.1 joerg using TableVector = llvm::TinyPtrVector<void *>;
80 1.1 joerg
81 1.1 joerg /// The current set of on-disk and merged tables.
82 1.1 joerg /// We manually store the opaque value of the Table because TinyPtrVector
83 1.1 joerg /// can't cope with holding a PointerUnion directly.
84 1.1 joerg /// There can be at most one MergedTable in this vector, and if present,
85 1.1 joerg /// it is the first table.
86 1.1 joerg TableVector Tables;
87 1.1 joerg
88 1.1 joerg /// Files corresponding to overridden tables that we've not yet
89 1.1 joerg /// discarded.
90 1.1 joerg llvm::TinyPtrVector<file_type> PendingOverrides;
91 1.1 joerg
92 1.1 joerg struct AsOnDiskTable {
93 1.1 joerg using result_type = OnDiskTable *;
94 1.1 joerg
95 1.1 joerg result_type operator()(void *P) const {
96 1.1 joerg return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
97 1.1 joerg }
98 1.1 joerg };
99 1.1 joerg
100 1.1 joerg using table_iterator =
101 1.1 joerg llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
102 1.1 joerg using table_range = llvm::iterator_range<table_iterator>;
103 1.1 joerg
104 1.1 joerg /// The current set of on-disk tables.
105 1.1 joerg table_range tables() {
106 1.1 joerg auto Begin = Tables.begin(), End = Tables.end();
107 1.1 joerg if (getMergedTable())
108 1.1 joerg ++Begin;
109 1.1 joerg return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
110 1.1 joerg llvm::map_iterator(End, AsOnDiskTable()));
111 1.1 joerg }
112 1.1 joerg
113 1.1 joerg MergedTable *getMergedTable() const {
114 1.1 joerg // If we already have a merged table, it's the first one.
115 1.1 joerg return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
116 1.1 joerg .template dyn_cast<MergedTable*>();
117 1.1 joerg }
118 1.1 joerg
119 1.1 joerg /// Delete all our current on-disk tables.
120 1.1 joerg void clear() {
121 1.1 joerg for (auto *T : tables())
122 1.1 joerg delete T;
123 1.1 joerg if (auto *M = getMergedTable())
124 1.1 joerg delete M;
125 1.1 joerg Tables.clear();
126 1.1 joerg }
127 1.1 joerg
128 1.1 joerg void removeOverriddenTables() {
129 1.1 joerg llvm::DenseSet<file_type> Files;
130 1.1 joerg Files.insert(PendingOverrides.begin(), PendingOverrides.end());
131 1.1 joerg // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
132 1.1 joerg auto ShouldRemove = [&Files](void *T) -> bool {
133 1.1 joerg auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
134 1.1 joerg bool Remove = Files.count(ODT->File);
135 1.1 joerg if (Remove)
136 1.1 joerg delete ODT;
137 1.1 joerg return Remove;
138 1.1 joerg };
139 1.1 joerg Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
140 1.1 joerg ShouldRemove),
141 1.1 joerg Tables.end());
142 1.1 joerg PendingOverrides.clear();
143 1.1 joerg }
144 1.1 joerg
145 1.1 joerg void condense() {
146 1.1 joerg MergedTable *Merged = getMergedTable();
147 1.1 joerg if (!Merged)
148 1.1 joerg Merged = new MergedTable;
149 1.1 joerg
150 1.1 joerg // Read in all the tables and merge them together.
151 1.1 joerg // FIXME: Be smarter about which tables we merge.
152 1.1 joerg for (auto *ODT : tables()) {
153 1.1 joerg auto &HT = ODT->Table;
154 1.1 joerg Info &InfoObj = HT.getInfoObj();
155 1.1 joerg
156 1.1 joerg for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
157 1.1 joerg auto *LocalPtr = I.getItem();
158 1.1 joerg
159 1.1 joerg // FIXME: Don't rely on the OnDiskHashTable format here.
160 1.1 joerg auto L = InfoObj.ReadKeyDataLength(LocalPtr);
161 1.1 joerg const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
162 1.1 joerg data_type_builder ValueBuilder(Merged->Data[Key]);
163 1.1 joerg InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
164 1.1 joerg ValueBuilder);
165 1.1 joerg }
166 1.1 joerg
167 1.1 joerg Merged->Files.push_back(ODT->File);
168 1.1 joerg delete ODT;
169 1.1 joerg }
170 1.1 joerg
171 1.1 joerg Tables.clear();
172 1.1 joerg Tables.push_back(Table(Merged).getOpaqueValue());
173 1.1 joerg }
174 1.1 joerg
175 1.1 joerg public:
176 1.1 joerg MultiOnDiskHashTable() = default;
177 1.1 joerg
178 1.1 joerg MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
179 1.1 joerg : Tables(std::move(O.Tables)),
180 1.1 joerg PendingOverrides(std::move(O.PendingOverrides)) {
181 1.1 joerg O.Tables.clear();
182 1.1 joerg }
183 1.1 joerg
184 1.1 joerg MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
185 1.1 joerg if (&O == this)
186 1.1 joerg return *this;
187 1.1 joerg clear();
188 1.1 joerg Tables = std::move(O.Tables);
189 1.1 joerg O.Tables.clear();
190 1.1 joerg PendingOverrides = std::move(O.PendingOverrides);
191 1.1 joerg return *this;
192 1.1 joerg }
193 1.1 joerg
194 1.1 joerg ~MultiOnDiskHashTable() { clear(); }
195 1.1 joerg
196 1.1 joerg /// Add the table \p Data loaded from file \p File.
197 1.1 joerg void add(file_type File, storage_type Data, Info InfoObj = Info()) {
198 1.1 joerg using namespace llvm::support;
199 1.1 joerg
200 1.1 joerg storage_type Ptr = Data;
201 1.1 joerg
202 1.1 joerg uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
203 1.1 joerg
204 1.1 joerg // Read the list of overridden files.
205 1.1 joerg uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
206 1.1 joerg // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
207 1.1 joerg // an additional copy.
208 1.1 joerg llvm::SmallVector<file_type, 16> OverriddenFiles;
209 1.1 joerg OverriddenFiles.reserve(NumFiles);
210 1.1 joerg for (/**/; NumFiles != 0; --NumFiles)
211 1.1 joerg OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
212 1.1 joerg PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
213 1.1 joerg OverriddenFiles.end());
214 1.1 joerg
215 1.1 joerg // Read the OnDiskChainedHashTable header.
216 1.1 joerg storage_type Buckets = Data + BucketOffset;
217 1.1 joerg auto NumBucketsAndEntries =
218 1.1 joerg OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
219 1.1 joerg
220 1.1 joerg // Register the table.
221 1.1 joerg Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
222 1.1 joerg NumBucketsAndEntries.second,
223 1.1 joerg Buckets, Ptr, Data, std::move(InfoObj));
224 1.1 joerg Tables.push_back(NewTable.getOpaqueValue());
225 1.1 joerg }
226 1.1 joerg
227 1.1 joerg /// Find and read the lookup results for \p EKey.
228 1.1 joerg data_type find(const external_key_type &EKey) {
229 1.1 joerg data_type Result;
230 1.1 joerg
231 1.1 joerg if (!PendingOverrides.empty())
232 1.1 joerg removeOverriddenTables();
233 1.1 joerg
234 1.1 joerg if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
235 1.1 joerg condense();
236 1.1 joerg
237 1.1 joerg internal_key_type Key = Info::GetInternalKey(EKey);
238 1.1 joerg auto KeyHash = Info::ComputeHash(Key);
239 1.1 joerg
240 1.1 joerg if (MergedTable *M = getMergedTable()) {
241 1.1 joerg auto It = M->Data.find(Key);
242 1.1 joerg if (It != M->Data.end())
243 1.1 joerg Result = It->second;
244 1.1 joerg }
245 1.1 joerg
246 1.1 joerg data_type_builder ResultBuilder(Result);
247 1.1 joerg
248 1.1 joerg for (auto *ODT : tables()) {
249 1.1 joerg auto &HT = ODT->Table;
250 1.1 joerg auto It = HT.find_hashed(Key, KeyHash);
251 1.1 joerg if (It != HT.end())
252 1.1 joerg HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
253 1.1 joerg ResultBuilder);
254 1.1 joerg }
255 1.1 joerg
256 1.1 joerg return Result;
257 1.1 joerg }
258 1.1 joerg
259 1.1 joerg /// Read all the lookup results into a single value. This only makes
260 1.1 joerg /// sense if merging values across keys is meaningful.
261 1.1 joerg data_type findAll() {
262 1.1 joerg data_type Result;
263 1.1 joerg data_type_builder ResultBuilder(Result);
264 1.1 joerg
265 1.1 joerg if (!PendingOverrides.empty())
266 1.1 joerg removeOverriddenTables();
267 1.1 joerg
268 1.1 joerg if (MergedTable *M = getMergedTable()) {
269 1.1 joerg for (auto &KV : M->Data)
270 1.1 joerg Info::MergeDataInto(KV.second, ResultBuilder);
271 1.1 joerg }
272 1.1 joerg
273 1.1 joerg for (auto *ODT : tables()) {
274 1.1 joerg auto &HT = ODT->Table;
275 1.1 joerg Info &InfoObj = HT.getInfoObj();
276 1.1 joerg for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
277 1.1 joerg auto *LocalPtr = I.getItem();
278 1.1 joerg
279 1.1 joerg // FIXME: Don't rely on the OnDiskHashTable format here.
280 1.1 joerg auto L = InfoObj.ReadKeyDataLength(LocalPtr);
281 1.1 joerg const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
282 1.1 joerg InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
283 1.1 joerg }
284 1.1 joerg }
285 1.1 joerg
286 1.1 joerg return Result;
287 1.1 joerg }
288 1.1 joerg };
289 1.1 joerg
290 1.1 joerg /// Writer for the on-disk hash table.
291 1.1 joerg template<typename ReaderInfo, typename WriterInfo>
292 1.1 joerg class MultiOnDiskHashTableGenerator {
293 1.1 joerg using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
294 1.1 joerg using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
295 1.1 joerg
296 1.1 joerg Generator Gen;
297 1.1 joerg
298 1.1 joerg public:
299 1.1 joerg MultiOnDiskHashTableGenerator() : Gen() {}
300 1.1 joerg
301 1.1 joerg void insert(typename WriterInfo::key_type_ref Key,
302 1.1 joerg typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
303 1.1 joerg Gen.insert(Key, Data, Info);
304 1.1 joerg }
305 1.1 joerg
306 1.1 joerg void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
307 1.1 joerg const BaseTable *Base) {
308 1.1 joerg using namespace llvm::support;
309 1.1 joerg
310 1.1 joerg llvm::raw_svector_ostream OutStream(Out);
311 1.1 joerg
312 1.1 joerg // Write our header information.
313 1.1 joerg {
314 1.1 joerg endian::Writer Writer(OutStream, little);
315 1.1 joerg
316 1.1 joerg // Reserve four bytes for the bucket offset.
317 1.1 joerg Writer.write<uint32_t>(0);
318 1.1 joerg
319 1.1 joerg if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
320 1.1 joerg // Write list of overridden files.
321 1.1 joerg Writer.write<uint32_t>(Merged->Files.size());
322 1.1 joerg for (const auto &F : Merged->Files)
323 1.1 joerg Info.EmitFileRef(OutStream, F);
324 1.1 joerg
325 1.1 joerg // Add all merged entries from Base to the generator.
326 1.1 joerg for (auto &KV : Merged->Data) {
327 1.1 joerg if (!Gen.contains(KV.first, Info))
328 1.1 joerg Gen.insert(KV.first, Info.ImportData(KV.second), Info);
329 1.1 joerg }
330 1.1 joerg } else {
331 1.1 joerg Writer.write<uint32_t>(0);
332 1.1 joerg }
333 1.1 joerg }
334 1.1 joerg
335 1.1 joerg // Write the table itself.
336 1.1 joerg uint32_t BucketOffset = Gen.Emit(OutStream, Info);
337 1.1 joerg
338 1.1 joerg // Replace the first four bytes with the bucket offset.
339 1.1 joerg endian::write32le(Out.data(), BucketOffset);
340 1.1 joerg }
341 1.1 joerg };
342 1.1 joerg
343 1.1 joerg } // namespace serialization
344 1.1 joerg } // namespace clang
345 1.1 joerg
346 1.1 joerg #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
347