#include <cassert>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>

/*
 * Page Allocator Data Preprocessor (C++ Version)
 *
 * This tool processes real allocation traces (collected via BPF)
 * and converts them into a format suitable for the PA simulator.
 *
 * Supported input formats:
 * HPA: shard_ind_int,addr_int,nsecs_int,probe,size_int
 * SEC: process_id,thread_id,thread_name,nsecs_int,_c4,sec_ptr_int,sec_shard_ptr_int,edata_ptr_int,size_int,is_frequent_reuse_int
 *
 * Output format (5 columns):
 * shard_ind_int,operation_index,size_or_alloc_index,nsecs,is_frequent
 * where:
 * - shard_ind_int: shard index as integer
 * - operation_index: 0=alloc, 1=dalloc
 * - size_or_alloc_index: for alloc operations show bytes,
 *   for dalloc operations show index of corresponding alloc
 * - nsecs: nanoseconds of some monotonic clock
 * - is_frequent: 1 if frequent reuse allocation, 0 otherwise
 */

enum class TraceFormat { HPA, SEC };

/*
 * One input row, normalized across the HPA and SEC formats.
 * Members are default-initialized so that an event abandoned halfway through
 * parsing never carries indeterminate values.
 */
struct TraceEvent {
	int shard_ind = 0;
	uintptr_t addr = 0;
	uint64_t nsecs = 0;
	std::string probe;
	size_t size = 0;
	bool is_frequent = false;
};

/* Bookkeeping kept for every allocation that has not been freed yet. */
struct AllocationRecord {
	uintptr_t addr;
	size_t size;
	int shard_ind;
	size_t alloc_index; /* Position of the alloc in the output alloc sequence. */
	uint64_t nsecs;
};

/* Maps a live address to the record of the allocation that produced it. */
class AllocationTracker {
private:
	std::unordered_map<uintptr_t, AllocationRecord> records_;

public:
	/* Records an allocation; an existing record for addr is overwritten. */
	void
	add_allocation(uintptr_t addr, size_t size, int shard_ind,
	    size_t alloc_index, uint64_t nsecs) {
		records_[addr] = {addr, size, shard_ind, alloc_index, nsecs};
	}

	/* Returns the record stored for addr, or nullptr when there is none. */
	AllocationRecord *
	find_allocation(uintptr_t addr) {
		auto it = records_.find(addr);
		return (it != records_.end()) ? &it->second : nullptr;
	}

	/* Drops the record stored for addr (no-op when there is none). */
	void
	remove_allocation(uintptr_t addr) {
		records_.erase(addr);
	}

	/* Number of allocations currently tracked. */
	size_t
	count() const {
		return records_.size();
	}
};

/*
 * Assigns dense arena indices (0, 1, 2, ...) to sec pointers in the order in
 * which they are first seen.
 */
class ArenaMapper {
private:
	std::unordered_map<uintptr_t, int> sec_ptr_to_arena_;
	int next_arena_index_;

public:
	ArenaMapper() : next_arena_index_(0) {}

	/*
	 * Returns the arena index for sec_ptr, allocating a new one on first use.
	 * A null sec_ptr is not recorded and yields 0, which is also the index
	 * handed to the first real sec_ptr.
	 */
	int
	get_arena_index(uintptr_t sec_ptr) {
		if (sec_ptr == 0) {
			/* Should not be seeing null sec pointer anywhere. Use this as a sanity check.*/
			return 0;
		}

		auto it = sec_ptr_to_arena_.find(sec_ptr);
		if (it != sec_ptr_to_arena_.end()) {
			return it->second;
		}

		/* New sec_ptr, assign next available arena index */
		int arena_index = next_arena_index_++;
		sec_ptr_to_arena_[sec_ptr] = arena_index;
		return arena_index;
	}

	/* Number of distinct non-null sec pointers seen so far. */
	size_t
	arena_count() const {
		return sec_ptr_to_arena_.size();
	}
};

/* True when probe names an allocation event in either input format. */
bool
is_alloc_operation(const std::string &probe) {
	return (probe == "hpa_alloc" || probe == "sec_alloc");
}

/* True when probe names a deallocation event in either input format. */
bool
is_dalloc_operation(const std::string &probe) {
	return (probe == "hpa_dalloc" || probe == "sec_dalloc");
}

/*
 * Reads the next comma-separated field into token and reports whether one was
 * available.
 *
 * token is cleared first: once the stream has hit end-of-input, std::getline
 * fails before touching its output string, so without the clear a caller that
 * tolerates a missing field would see the previous field's text again.
 */
static bool
next_field(std::istringstream &ss, std::string &token) {
	token.clear();
	return static_cast<bool>(std::getline(ss, token, ','));
}

/*
 * Converts token with std::stoull. Returns false (leaving value untouched)
 * when std::stoull rejects the text, instead of letting the exception escape.
 */
static bool
parse_unsigned_field(const std::string &token, uint64_t &value) {
	try {
		value = std::stoull(token);
	} catch (const std::logic_error &) {
		/* std::invalid_argument or std::out_of_range. */
		return false;
	}
	return true;
}

/*
 * Converts token with std::stoi. Returns false (leaving value untouched)
 * when std::stoi rejects the text, instead of letting the exception escape.
 */
static bool
parse_int_field(const std::string &token, int &value) {
	try {
		value = std::stoi(token);
	} catch (const std::logic_error &) {
		/* std::invalid_argument or std::out_of_range. */
		return false;
	}
	return true;
}

/*
 * Parses one HPA row: shard_ind_int,addr_int,nsecs_int,probe,size_int.
 *
 * Returns true on error (missing or malformed field) and false on success;
 * on error the contents of event are unspecified and must not be used.
 */
bool
parse_hpa_line(const std::string &line, TraceEvent &event) {
	std::istringstream ss(line);
	std::string token;
	uint64_t number = 0;

	/* Parse shard_ind_int */
	if (!next_field(ss, token) || !parse_int_field(token, event.shard_ind)) {
		return true;
	}

	/* Parse addr_int */
	if (!next_field(ss, token) || !parse_unsigned_field(token, number)) {
		return true;
	}
	event.addr = static_cast<uintptr_t>(number);

	/* Parse nsecs_int */
	if (!next_field(ss, token) || !parse_unsigned_field(token, number)) {
		return true;
	}
	event.nsecs = number;

	/* Parse probe */
	if (!next_field(ss, token)) {
		return true;
	}
	event.probe = token;

	/* Parse size_int */
	if (!next_field(ss, token) || !parse_unsigned_field(token, number)) {
		return true;
	}
	event.size = static_cast<size_t>(number);

	/* HPA format doesn't have is_frequent field, set default */
	event.is_frequent = true;

	return false;
}

/*
 * Parses one SEC row:
 * process_id,thread_id,thread_name,nsecs_int,_c4,sec_ptr_int,
 * sec_shard_ptr_int,edata_ptr_int,size_int,is_frequent_reuse_int
 *
 * The sec pointer is translated to a shard index through arena_mapper (which
 * may allocate a new index even if a later field turns out to be bad). For
 * dalloc rows the trailing size and is_frequent fields may be absent; they
 * then default to 0 and false.
 *
 * Returns true on error (missing or malformed field) and false on success;
 * on error the contents of event are unspecified and must not be used.
 */
bool
parse_sec_line(
    const std::string &line, TraceEvent &event, ArenaMapper &arena_mapper) {
	std::istringstream ss(line);
	std::string token;
	uint64_t number = 0;

	/* Skip process_id */
	if (!next_field(ss, token)) {
		return true;
	}

	/* Skip thread_id */
	if (!next_field(ss, token)) {
		return true;
	}

	/* Skip thread_name */
	if (!next_field(ss, token)) {
		return true;
	}

	/* Parse nsecs_int */
	if (!next_field(ss, token) || !parse_unsigned_field(token, number)) {
		return true;
	}
	event.nsecs = number;

	/* Parse operation */
	if (!next_field(ss, token)) {
		return true;
	}
	event.probe = token;

	/* Parse sec_ptr_int (used for arena mapping) */
	uintptr_t sec_ptr = 0;
	if (!next_field(ss, token)) {
		return true;
	}
	if (!token.empty()) {
		if (!parse_unsigned_field(token, number)) {
			return true;
		}
		sec_ptr = static_cast<uintptr_t>(number);
	}

	/* Map sec_ptr to arena index */
	event.shard_ind = arena_mapper.get_arena_index(sec_ptr);

	/* Skip sec_shard_ptr_int */
	if (!next_field(ss, token)) {
		return true;
	}

	/* Parse edata_ptr_int (used as the address) */
	if (!next_field(ss, token)) {
		return true;
	}
	if (!token.empty()) {
		if (!parse_unsigned_field(token, number)) {
			return true;
		}
		event.addr = static_cast<uintptr_t>(number);
	} else {
		event.addr = 0;
	}

	/* Parse size_int */
	if (!next_field(ss, token) && !is_dalloc_operation(event.probe)) {
		/* SEC format may not always have size for dalloc */
		return true;
	}
	if (!token.empty()) {
		if (!parse_unsigned_field(token, number)) {
			return true;
		}
		event.size = static_cast<size_t>(number);
	} else {
		/* When no size given, this is a dalloc, size won't be used. */
		event.size = 0;
	}

	/* Parse is_frequent_reuse_int */
	if (!next_field(ss, token) && !is_dalloc_operation(event.probe)) {
		return true;
	}
	if (!token.empty()) {
		int flag = 0;
		if (!parse_int_field(token, flag)) {
			return true;
		}
		event.is_frequent = (flag != 0);
	} else {
		/*
		 * When no is_frequent_reuse_int given, this is a dalloc,
		 * is_frequent won't be used.
		 */
		event.is_frequent = false;
	}

	return false;
}

/* Writes the CSV column header of the simulator input file. */
void
write_output_header(std::ofstream &output) {
	output << "shard_ind,operation,size_or_alloc_index,nsecs,is_frequent\n";
}

/*
 * Writes one simulator row. operation is 0 for alloc and 1 for dalloc; value
 * is the size in bytes for an alloc and the alloc index for a dalloc.
 */
void
write_output_event(std::ofstream &output, int shard_ind, int operation,
    size_t value, uint64_t nsecs, bool is_frequent) {
	output << shard_ind << "," << operation << "," << value << "," << nsecs
	       << "," << (is_frequent ? 1 : 0) << "\n";
}

/*
 * Converts the trace in input_filename (first line is treated as a header and
 * skipped) into simulator rows written to output_filename.
 *
 * Every alloc row is emitted and remembered by address; a dalloc row is
 * emitted only when a tracked alloc with the same address exists. Rows that
 * fail to parse are skipped and counted.
 *
 * Returns the number of dalloc rows written (matched alloc/dalloc pairs), or
 * 0 when a file cannot be opened or the input is empty.
 */
size_t
process_trace_file(const std::string &input_filename,
    const std::string &output_filename, TraceFormat format) {
	std::ifstream input(input_filename);
	if (!input.is_open()) {
		std::cerr << "Failed to open input file: " << input_filename
		          << std::endl;
		return 0;
	}

	std::ofstream output(output_filename);
	if (!output.is_open()) {
		std::cerr << "Failed to open output file: " << output_filename
		          << std::endl;
		return 0;
	}

	AllocationTracker tracker;
	ArenaMapper arena_mapper; /* For SEC format arena mapping */

	std::string line;
	size_t line_count = 0;
	size_t output_count = 0;
	size_t alloc_sequence = 0; /* Sequential index for allocations */
	size_t unmatched_frees = 0;
	size_t malformed_lines = 0;

	write_output_header(output);
	std::cout << "Reading from: " << input_filename << std::endl;

	/* Skip header line */
	if (!std::getline(input, line)) {
		std::cerr << "Error: Empty input file" << std::endl;
		return 0;
	}

	while (std::getline(input, line)) {
		line_count++;

		/* Skip empty lines */
		if (line.empty()) {
			continue;
		}

		TraceEvent event;
		bool parse_error = false;

		if (format == TraceFormat::HPA) {
			parse_error = parse_hpa_line(line, event);
		} else if (format == TraceFormat::SEC) {
			parse_error = parse_sec_line(line, event, arena_mapper);
		}

		if (parse_error) {
			malformed_lines++;
			continue;
		}

		if (is_alloc_operation(event.probe)) {
			/* This is an allocation */
			write_output_event(output, event.shard_ind, 0,
			    event.size, event.nsecs, event.is_frequent);

			/* Track this allocation with the current sequence number */
			tracker.add_allocation(event.addr, event.size,
			    event.shard_ind, alloc_sequence, event.nsecs);
			alloc_sequence++;
		} else if (is_dalloc_operation(event.probe)) {
			/* This is a deallocation. Ignore dalloc without a corresponding alloc. */
			AllocationRecord *record = tracker.find_allocation(
			    event.addr);

			if (record) {
				/*
				 * Validate timing: deallocation should happen after
				 * allocation. (The check is compiled out when NDEBUG
				 * is defined.)
				 */
				assert(event.nsecs >= record->nsecs);
				/* Found matching allocation with valid timing */
				write_output_event(output, event.shard_ind, 1,
				    record->alloc_index, event.nsecs,
				    event.is_frequent);
				tracker.remove_allocation(event.addr);
				output_count++; /* Count this deallocation */
			} else {
				unmatched_frees++;
			}
		} else {
			std::cerr << "Unknown operation: " << event.probe
			          << std::endl;
		}
	}

	std::cout << "Processed " << line_count << " lines" << std::endl;
	std::cout << "Unmatched frees: " << unmatched_frees << std::endl;
	std::cout << "Extracted " << output_count << " alloc/dalloc pairs"
	          << std::endl;
	std::cout << "Results written to: " << output_filename << std::endl;
	if (malformed_lines != 0) {
		/* Surface dropped rows rather than discarding them silently. */
		std::cerr << "Skipped malformed lines: " << malformed_lines
		          << std::endl;
	}

	return output_count;
}

/*
 * Maps the command-line format name ("hpa" or "sec") to a TraceFormat.
 * Throws std::invalid_argument for any other string.
 */
TraceFormat
parse_format(const std::string &format_str) {
	if (format_str == "hpa") {
		return TraceFormat::HPA;
	} else if (format_str == "sec") {
		return TraceFormat::SEC;
	} else {
		throw std::invalid_argument(
		    "Unknown format: " + format_str + ". Use 'hpa' or 'sec'");
	}
}

/*
 * Usage: <program> <format> <input_csv_file> <output_file>
 *
 * Exits with 0 when at least one alloc/dalloc pair was written and with 1 on
 * a usage error, an exception, or when no pair was produced. A fifth
 * command-line argument passes the argument-count check but is not used.
 */
int
main(int argc, char *argv[]) {
	if (argc < 4 || argc > 5) {
		std::cerr << "Usage: " << argv[0]
		          << " <format> <input_csv_file> <output_file>"
		          << std::endl;
		std::cerr << std::endl;
		std::cerr << "Arguments:" << std::endl;
		std::cerr << " format - Input format: 'hpa' or 'sec'"
		          << std::endl;
		std::cerr
		    << " hpa: shard_ind_int,addr_int,nsecs_int,probe,size_int"
		    << std::endl;
		std::cerr
		    << " sec: process_id,thread_id,thread_name,nsecs_int,_c4,sec_ptr_int,sec_shard_ptr_int,edata_ptr_int,size_int,is_frequent_reuse_int"
		    << std::endl;
		std::cerr << " input_csv_file - Input CSV trace file"
		          << std::endl;
		std::cerr
		    << " output_file - Output file for simulator with format:"
		    << std::endl;
		std::cerr
		    << " shard_ind,operation,size_or_alloc_index,nsecs,is_frequent"
		    << std::endl;
		std::cerr << std::endl;
		std::cerr << "Output format:" << std::endl;
		std::cerr << " - operation: 0=alloc, 1=dalloc" << std::endl;
		std::cerr
		    << " - size_or_alloc_index: bytes for alloc, alloc index for dalloc"
		    << std::endl;
		return 1;
	}

	try {
		TraceFormat format = parse_format(argv[1]);
		std::string input_file = argv[2];
		std::string output_file = argv[3];

		size_t events_generated = process_trace_file(
		    input_file, output_file, format);

		if (events_generated == 0) {
			std::cerr
			    << "No events generated. Check input file format and filtering criteria."
			    << std::endl;
			return 1;
		}
		return 0;
	} catch (const std::exception &e) {
		std::cerr << "Error: " << e.what() << std::endl;
		return 1;
	}
}