1 1.1 wiz /* 2 1.1 wiz __ __ _ 3 1.1 wiz ___\ \/ /_ __ __ _| |_ 4 1.1 wiz / _ \\ /| '_ \ / _` | __| 5 1.1 wiz | __// \| |_) | (_| | |_ 6 1.1 wiz \___/_/\_\ .__/ \__,_|\__| 7 1.1 wiz |_| XML parser 8 1.1 wiz 9 1.1 wiz Copyright (c) 2022 Mark Brand <markbrand (at) google.com> 10 1.1 wiz Copyright (c) 2025 Sebastian Pipping <sebastian (at) pipping.org> 11 1.1 wiz Licensed under the MIT license: 12 1.1 wiz 13 1.1 wiz Permission is hereby granted, free of charge, to any person obtaining 14 1.1 wiz a copy of this software and associated documentation files (the 15 1.1 wiz "Software"), to deal in the Software without restriction, including 16 1.1 wiz without limitation the rights to use, copy, modify, merge, publish, 17 1.1 wiz distribute, sublicense, and/or sell copies of the Software, and to permit 18 1.1 wiz persons to whom the Software is furnished to do so, subject to the 19 1.1 wiz following conditions: 20 1.1 wiz 21 1.1 wiz The above copyright notice and this permission notice shall be included 22 1.1 wiz in all copies or substantial portions of the Software. 23 1.1 wiz 24 1.1 wiz THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 1.1 wiz EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 1.1 wiz MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 27 1.1 wiz NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 28 1.1 wiz DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 29 1.1 wiz OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 30 1.1 wiz USE OR OTHER DEALINGS IN THE SOFTWARE. 31 1.1 wiz */ 32 1.1 wiz 33 1.1 wiz #if defined(NDEBUG) 34 1.1 wiz # undef NDEBUG // because checks below rely on assert(...) 35 1.1 wiz #endif 36 1.1 wiz 37 1.1 wiz #include <assert.h> 38 1.1 wiz #include <stdint.h> 39 1.1 wiz #include <vector> 40 1.1 wiz 41 1.1 wiz #include "expat.h" 42 1.1 wiz #include "xml_lpm_fuzzer.pb.h" 43 1.1 wiz #include "src/libfuzzer/libfuzzer_macro.h" 44 1.1 wiz 45 1.1 wiz static const char *g_encoding = nullptr; 46 1.1 wiz static const char *g_external_entity = nullptr; 47 1.1 wiz static size_t g_external_entity_size = 0; 48 1.1 wiz 49 1.1 wiz void 50 1.1 wiz SetEncoding(const xml_lpm_fuzzer::Encoding &e) { 51 1.1 wiz switch (e) { 52 1.1 wiz case xml_lpm_fuzzer::Encoding::UTF8: 53 1.1 wiz g_encoding = "UTF-8"; 54 1.1 wiz break; 55 1.1 wiz 56 1.1 wiz case xml_lpm_fuzzer::Encoding::UTF16: 57 1.1 wiz g_encoding = "UTF-16"; 58 1.1 wiz break; 59 1.1 wiz 60 1.1 wiz case xml_lpm_fuzzer::Encoding::ISO88591: 61 1.1 wiz g_encoding = "ISO-8859-1"; 62 1.1 wiz break; 63 1.1 wiz 64 1.1 wiz case xml_lpm_fuzzer::Encoding::ASCII: 65 1.1 wiz g_encoding = "US-ASCII"; 66 1.1 wiz break; 67 1.1 wiz 68 1.1 wiz case xml_lpm_fuzzer::Encoding::NONE: 69 1.1 wiz g_encoding = NULL; 70 1.1 wiz break; 71 1.1 wiz 72 1.1 wiz default: 73 1.1 wiz g_encoding = "UNKNOWN"; 74 1.1 wiz break; 75 1.1 wiz } 76 1.1 wiz } 77 1.1 wiz 78 1.1 wiz static int g_allocation_count = 0; 79 1.1 wiz static std::vector<int> g_fail_allocations = {}; 80 1.1 wiz 81 1.1 wiz void * 82 1.1 wiz MallocHook(size_t size) { 83 1.1 wiz g_allocation_count += 1; 84 1.1 wiz for (auto index : g_fail_allocations) { 85 1.1 wiz if (index == g_allocation_count) { 86 1.1 wiz return NULL; 87 1.1 wiz } 88 1.1 wiz } 89 1.1 wiz return malloc(size); 90 1.1 wiz } 91 1.1 wiz 92 1.1 wiz void * 93 1.1 wiz ReallocHook(void *ptr, size_t size) { 94 1.1 wiz g_allocation_count += 1; 95 1.1 wiz for (auto index : g_fail_allocations) { 96 1.1 wiz if (index == g_allocation_count) { 97 1.1 wiz return NULL; 98 1.1 wiz } 99 1.1 wiz } 100 1.1 wiz return realloc(ptr, size); 101 1.1 wiz } 102 1.1 wiz 103 1.1 wiz void 104 1.1 wiz FreeHook(void *ptr) { 105 1.1 wiz free(ptr); 106 1.1 wiz } 107 1.1 wiz 108 1.1 wiz XML_Memory_Handling_Suite memory_handling_suite 109 1.1 wiz = {MallocHook, ReallocHook, FreeHook}; 110 1.1 wiz 111 1.1 wiz void InitializeParser(XML_Parser parser); 112 1.1 wiz 113 1.1 wiz // We want a parse function that supports resumption, so that we can cover the 114 1.1 wiz // suspend/resume code. 115 1.1 wiz enum XML_Status 116 1.1 wiz Parse(XML_Parser parser, const char *input, int input_len, int is_final) { 117 1.1 wiz enum XML_Status status = XML_Parse(parser, input, input_len, is_final); 118 1.1 wiz while (status == XML_STATUS_SUSPENDED) { 119 1.1 wiz status = XML_ResumeParser(parser); 120 1.1 wiz } 121 1.1 wiz return status; 122 1.1 wiz } 123 1.1 wiz 124 1.1 wiz // When the fuzzer is compiled with instrumentation such as ASan, then the 125 1.1 wiz // accesses in TouchString will fault if they access invalid memory (ie. detect 126 1.1 wiz // either a use-after-free or buffer-overflow). By calling TouchString in each 127 1.1 wiz // of the callbacks, we can check that the arguments meet the API specifications 128 1.1 wiz // in terms of length/null-termination. no_optimize is used to ensure that the 129 1.1 wiz // compiler has to emit actual memory reads, instead of removing them. 130 1.1 wiz static volatile size_t no_optimize = 0; 131 1.1 wiz static void 132 1.1 wiz TouchString(const XML_Char *ptr, int len = -1) { 133 1.1 wiz if (! ptr) { 134 1.1 wiz return; 135 1.1 wiz } 136 1.1 wiz 137 1.1 wiz if (len == -1) { 138 1.1 wiz for (XML_Char value = *ptr++; value; value = *ptr++) { 139 1.1 wiz no_optimize += value; 140 1.1 wiz } 141 1.1 wiz } else { 142 1.1 wiz for (int i = 0; i < len; ++i) { 143 1.1 wiz no_optimize += ptr[i]; 144 1.1 wiz } 145 1.1 wiz } 146 1.1 wiz } 147 1.1 wiz 148 1.1 wiz static void 149 1.1 wiz TouchNodeAndRecurse(XML_Content *content) { 150 1.1 wiz switch (content->type) { 151 1.1 wiz case XML_CTYPE_EMPTY: 152 1.1 wiz case XML_CTYPE_ANY: 153 1.1 wiz assert(content->quant == XML_CQUANT_NONE); 154 1.1 wiz assert(content->name == NULL); 155 1.1 wiz assert(content->numchildren == 0); 156 1.1 wiz assert(content->children == NULL); 157 1.1 wiz break; 158 1.1 wiz 159 1.1 wiz case XML_CTYPE_MIXED: 160 1.1 wiz assert(content->quant == XML_CQUANT_NONE 161 1.1 wiz || content->quant == XML_CQUANT_REP); 162 1.1 wiz assert(content->name == NULL); 163 1.1 wiz for (unsigned int i = 0; i < content->numchildren; ++i) { 164 1.1 wiz assert(content->children[i].type == XML_CTYPE_NAME); 165 1.1 wiz assert(content->children[i].quant == XML_CQUANT_NONE); 166 1.1 wiz assert(content->children[i].numchildren == 0); 167 1.1 wiz assert(content->children[i].children == NULL); 168 1.1 wiz TouchString(content->children[i].name); 169 1.1 wiz } 170 1.1 wiz break; 171 1.1 wiz 172 1.1 wiz case XML_CTYPE_NAME: 173 1.1 wiz assert((content->quant == XML_CQUANT_NONE) 174 1.1 wiz || (content->quant == XML_CQUANT_OPT) 175 1.1 wiz || (content->quant == XML_CQUANT_REP) 176 1.1 wiz || (content->quant == XML_CQUANT_PLUS)); 177 1.1 wiz assert(content->numchildren == 0); 178 1.1 wiz assert(content->children == NULL); 179 1.1 wiz TouchString(content->name); 180 1.1 wiz break; 181 1.1 wiz 182 1.1 wiz case XML_CTYPE_CHOICE: 183 1.1 wiz case XML_CTYPE_SEQ: 184 1.1 wiz assert((content->quant == XML_CQUANT_NONE) 185 1.1 wiz || (content->quant == XML_CQUANT_OPT) 186 1.1 wiz || (content->quant == XML_CQUANT_REP) 187 1.1 wiz || (content->quant == XML_CQUANT_PLUS)); 188 1.1 wiz assert(content->name == NULL); 189 1.1 wiz for (unsigned int i = 0; i < content->numchildren; ++i) { 190 1.1 wiz TouchNodeAndRecurse(&content->children[i]); 191 1.1 wiz } 192 1.1 wiz break; 193 1.1 wiz 194 1.1 wiz default: 195 1.1 wiz assert(false); 196 1.1 wiz } 197 1.1 wiz } 198 1.1 wiz 199 1.1 wiz static void XMLCALL 200 1.1 wiz ElementDeclHandler(void *userData, const XML_Char *name, XML_Content *model) { 201 1.1 wiz TouchString(name); 202 1.1 wiz TouchNodeAndRecurse(model); 203 1.1 wiz XML_FreeContentModel((XML_Parser)userData, model); 204 1.1 wiz } 205 1.1 wiz 206 1.1 wiz static void XMLCALL 207 1.1 wiz AttlistDeclHandler(void *userData, const XML_Char *elname, 208 1.1 wiz const XML_Char *attname, const XML_Char *atttype, 209 1.1 wiz const XML_Char *dflt, int isrequired) { 210 1.1 wiz (void)userData; 211 1.1 wiz TouchString(elname); 212 1.1 wiz TouchString(attname); 213 1.1 wiz TouchString(atttype); 214 1.1 wiz TouchString(dflt); 215 1.1 wiz (void)isrequired; 216 1.1 wiz } 217 1.1 wiz 218 1.1 wiz static void XMLCALL 219 1.1 wiz XmlDeclHandler(void *userData, const XML_Char *version, 220 1.1 wiz const XML_Char *encoding, int standalone) { 221 1.1 wiz (void)userData; 222 1.1 wiz TouchString(version); 223 1.1 wiz TouchString(encoding); 224 1.1 wiz (void)standalone; 225 1.1 wiz } 226 1.1 wiz 227 1.1 wiz static void XMLCALL 228 1.1 wiz StartElementHandler(void *userData, const XML_Char *name, 229 1.1 wiz const XML_Char **atts) { 230 1.1 wiz (void)userData; 231 1.1 wiz TouchString(name); 232 1.1 wiz for (size_t i = 0; atts[i] != NULL; ++i) { 233 1.1 wiz TouchString(atts[i]); 234 1.1 wiz } 235 1.1 wiz } 236 1.1 wiz 237 1.1 wiz static void XMLCALL 238 1.1 wiz EndElementHandler(void *userData, const XML_Char *name) { 239 1.1 wiz (void)userData; 240 1.1 wiz TouchString(name); 241 1.1 wiz } 242 1.1 wiz 243 1.1 wiz static void XMLCALL 244 1.1 wiz CharacterDataHandler(void *userData, const XML_Char *s, int len) { 245 1.1 wiz (void)userData; 246 1.1 wiz TouchString(s, len); 247 1.1 wiz } 248 1.1 wiz 249 1.1 wiz static void XMLCALL 250 1.1 wiz ProcessingInstructionHandler(void *userData, const XML_Char *target, 251 1.1 wiz const XML_Char *data) { 252 1.1 wiz (void)userData; 253 1.1 wiz TouchString(target); 254 1.1 wiz TouchString(data); 255 1.1 wiz } 256 1.1 wiz 257 1.1 wiz static void XMLCALL 258 1.1 wiz CommentHandler(void *userData, const XML_Char *data) { 259 1.1 wiz TouchString(data); 260 1.1 wiz // Use the comment handler to trigger parser suspend, so that we can get 261 1.1 wiz // coverage of that code. 262 1.1 wiz XML_StopParser((XML_Parser)userData, XML_TRUE); 263 1.1 wiz } 264 1.1 wiz 265 1.1 wiz static void XMLCALL 266 1.1 wiz StartCdataSectionHandler(void *userData) { 267 1.1 wiz (void)userData; 268 1.1 wiz } 269 1.1 wiz 270 1.1 wiz static void XMLCALL 271 1.1 wiz EndCdataSectionHandler(void *userData) { 272 1.1 wiz (void)userData; 273 1.1 wiz } 274 1.1 wiz 275 1.1 wiz static void XMLCALL 276 1.1 wiz DefaultHandler(void *userData, const XML_Char *s, int len) { 277 1.1 wiz (void)userData; 278 1.1 wiz TouchString(s, len); 279 1.1 wiz } 280 1.1 wiz 281 1.1 wiz static void XMLCALL 282 1.1 wiz StartDoctypeDeclHandler(void *userData, const XML_Char *doctypeName, 283 1.1 wiz const XML_Char *sysid, const XML_Char *pubid, 284 1.1 wiz int has_internal_subset) { 285 1.1 wiz (void)userData; 286 1.1 wiz TouchString(doctypeName); 287 1.1 wiz TouchString(sysid); 288 1.1 wiz TouchString(pubid); 289 1.1 wiz (void)has_internal_subset; 290 1.1 wiz } 291 1.1 wiz 292 1.1 wiz static void XMLCALL 293 1.1 wiz EndDoctypeDeclHandler(void *userData) { 294 1.1 wiz (void)userData; 295 1.1 wiz } 296 1.1 wiz 297 1.1 wiz static void XMLCALL 298 1.1 wiz EntityDeclHandler(void *userData, const XML_Char *entityName, 299 1.1 wiz int is_parameter_entity, const XML_Char *value, 300 1.1 wiz int value_length, const XML_Char *base, 301 1.1 wiz const XML_Char *systemId, const XML_Char *publicId, 302 1.1 wiz const XML_Char *notationName) { 303 1.1 wiz (void)userData; 304 1.1 wiz TouchString(entityName); 305 1.1 wiz (void)is_parameter_entity; 306 1.1 wiz TouchString(value, value_length); 307 1.1 wiz TouchString(base); 308 1.1 wiz TouchString(systemId); 309 1.1 wiz TouchString(publicId); 310 1.1 wiz TouchString(notationName); 311 1.1 wiz } 312 1.1 wiz 313 1.1 wiz static void XMLCALL 314 1.1 wiz NotationDeclHandler(void *userData, const XML_Char *notationName, 315 1.1 wiz const XML_Char *base, const XML_Char *systemId, 316 1.1 wiz const XML_Char *publicId) { 317 1.1 wiz (void)userData; 318 1.1 wiz TouchString(notationName); 319 1.1 wiz TouchString(base); 320 1.1 wiz TouchString(systemId); 321 1.1 wiz TouchString(publicId); 322 1.1 wiz } 323 1.1 wiz 324 1.1 wiz static void XMLCALL 325 1.1 wiz StartNamespaceDeclHandler(void *userData, const XML_Char *prefix, 326 1.1 wiz const XML_Char *uri) { 327 1.1 wiz (void)userData; 328 1.1 wiz TouchString(prefix); 329 1.1 wiz TouchString(uri); 330 1.1 wiz } 331 1.1 wiz 332 1.1 wiz static void XMLCALL 333 1.1 wiz EndNamespaceDeclHandler(void *userData, const XML_Char *prefix) { 334 1.1 wiz (void)userData; 335 1.1 wiz TouchString(prefix); 336 1.1 wiz } 337 1.1 wiz 338 1.1 wiz static int XMLCALL 339 1.1 wiz NotStandaloneHandler(void *userData) { 340 1.1 wiz (void)userData; 341 1.1 wiz return XML_STATUS_OK; 342 1.1 wiz } 343 1.1 wiz 344 1.1 wiz static int XMLCALL 345 1.1 wiz ExternalEntityRefHandler(XML_Parser parser, const XML_Char *context, 346 1.1 wiz const XML_Char *base, const XML_Char *systemId, 347 1.1 wiz const XML_Char *publicId) { 348 1.1 wiz int rc = XML_STATUS_ERROR; 349 1.1 wiz TouchString(context); 350 1.1 wiz TouchString(base); 351 1.1 wiz TouchString(systemId); 352 1.1 wiz TouchString(publicId); 353 1.1 wiz 354 1.1 wiz if (g_external_entity) { 355 1.1 wiz XML_Parser ext_parser 356 1.1 wiz = XML_ExternalEntityParserCreate(parser, context, g_encoding); 357 1.1 wiz rc = Parse(ext_parser, g_external_entity, g_external_entity_size, 1); 358 1.1 wiz XML_ParserFree(ext_parser); 359 1.1 wiz } 360 1.1 wiz 361 1.1 wiz return rc; 362 1.1 wiz } 363 1.1 wiz 364 1.1 wiz static void XMLCALL 365 1.1 wiz SkippedEntityHandler(void *userData, const XML_Char *entityName, 366 1.1 wiz int is_parameter_entity) { 367 1.1 wiz (void)userData; 368 1.1 wiz TouchString(entityName); 369 1.1 wiz (void)is_parameter_entity; 370 1.1 wiz } 371 1.1 wiz 372 1.1 wiz static int XMLCALL 373 1.1 wiz UnknownEncodingHandler(void *encodingHandlerData, const XML_Char *name, 374 1.1 wiz XML_Encoding *info) { 375 1.1 wiz (void)encodingHandlerData; 376 1.1 wiz TouchString(name); 377 1.1 wiz (void)info; 378 1.1 wiz return XML_STATUS_ERROR; 379 1.1 wiz } 380 1.1 wiz 381 1.1 wiz void 382 1.1 wiz InitializeParser(XML_Parser parser) { 383 1.1 wiz XML_SetUserData(parser, (void *)parser); 384 1.1 wiz XML_SetHashSalt(parser, 0x41414141); 385 1.1 wiz XML_SetParamEntityParsing(parser, XML_PARAM_ENTITY_PARSING_ALWAYS); 386 1.1 wiz 387 1.1 wiz XML_SetElementDeclHandler(parser, ElementDeclHandler); 388 1.1 wiz XML_SetAttlistDeclHandler(parser, AttlistDeclHandler); 389 1.1 wiz XML_SetXmlDeclHandler(parser, XmlDeclHandler); 390 1.1 wiz XML_SetElementHandler(parser, StartElementHandler, EndElementHandler); 391 1.1 wiz XML_SetCharacterDataHandler(parser, CharacterDataHandler); 392 1.1 wiz XML_SetProcessingInstructionHandler(parser, ProcessingInstructionHandler); 393 1.1 wiz XML_SetCommentHandler(parser, CommentHandler); 394 1.1 wiz XML_SetCdataSectionHandler(parser, StartCdataSectionHandler, 395 1.1 wiz EndCdataSectionHandler); 396 1.1 wiz // XML_SetDefaultHandler disables entity expansion 397 1.1 wiz XML_SetDefaultHandlerExpand(parser, DefaultHandler); 398 1.1 wiz XML_SetDoctypeDeclHandler(parser, StartDoctypeDeclHandler, 399 1.1 wiz EndDoctypeDeclHandler); 400 1.1 wiz // Note: This is mutually exclusive with XML_SetUnparsedEntityDeclHandler, 401 1.1 wiz // and there isn't any significant code change between the two. 402 1.1 wiz XML_SetEntityDeclHandler(parser, EntityDeclHandler); 403 1.1 wiz XML_SetNotationDeclHandler(parser, NotationDeclHandler); 404 1.1 wiz XML_SetNamespaceDeclHandler(parser, StartNamespaceDeclHandler, 405 1.1 wiz EndNamespaceDeclHandler); 406 1.1 wiz XML_SetNotStandaloneHandler(parser, NotStandaloneHandler); 407 1.1 wiz XML_SetExternalEntityRefHandler(parser, ExternalEntityRefHandler); 408 1.1 wiz XML_SetSkippedEntityHandler(parser, SkippedEntityHandler); 409 1.1 wiz XML_SetUnknownEncodingHandler(parser, UnknownEncodingHandler, (void *)parser); 410 1.1 wiz } 411 1.1 wiz 412 1.1 wiz DEFINE_TEXT_PROTO_FUZZER(const xml_lpm_fuzzer::Testcase &testcase) { 413 1.1 wiz g_external_entity = nullptr; 414 1.1 wiz 415 1.1 wiz if (! testcase.actions_size()) { 416 1.1 wiz return; 417 1.1 wiz } 418 1.1 wiz 419 1.1 wiz g_allocation_count = 0; 420 1.1 wiz g_fail_allocations.clear(); 421 1.1 wiz for (int i = 0; i < testcase.fail_allocations_size(); ++i) { 422 1.1 wiz g_fail_allocations.push_back(testcase.fail_allocations(i)); 423 1.1 wiz } 424 1.1 wiz 425 1.1 wiz SetEncoding(testcase.encoding()); 426 1.1 wiz XML_Parser parser 427 1.1 wiz = XML_ParserCreate_MM(g_encoding, &memory_handling_suite, "|"); 428 1.1 wiz InitializeParser(parser); 429 1.1 wiz 430 1.1 wiz for (int i = 0; i < testcase.actions_size(); ++i) { 431 1.1 wiz const auto &action = testcase.actions(i); 432 1.1 wiz switch (action.action_case()) { 433 1.1 wiz case xml_lpm_fuzzer::Action::kChunk: 434 1.1 wiz if (XML_STATUS_ERROR 435 1.1 wiz == Parse(parser, action.chunk().data(), action.chunk().size(), 0)) { 436 1.1 wiz // Force a reset after parse error. 437 1.1 wiz XML_ParserReset(parser, g_encoding); 438 1.1 wiz InitializeParser(parser); 439 1.1 wiz } 440 1.1 wiz break; 441 1.1 wiz 442 1.1 wiz case xml_lpm_fuzzer::Action::kLastChunk: 443 1.1 wiz Parse(parser, action.last_chunk().data(), action.last_chunk().size(), 1); 444 1.1 wiz XML_ParserReset(parser, g_encoding); 445 1.1 wiz InitializeParser(parser); 446 1.1 wiz break; 447 1.1 wiz 448 1.1 wiz case xml_lpm_fuzzer::Action::kReset: 449 1.1 wiz XML_ParserReset(parser, g_encoding); 450 1.1 wiz InitializeParser(parser); 451 1.1 wiz break; 452 1.1 wiz 453 1.1 wiz case xml_lpm_fuzzer::Action::kExternalEntity: 454 1.1 wiz g_external_entity = action.external_entity().data(); 455 1.1 wiz g_external_entity_size = action.external_entity().size(); 456 1.1 wiz break; 457 1.1 wiz 458 1.1 wiz default: 459 1.1 wiz break; 460 1.1 wiz } 461 1.1 wiz } 462 1.1 wiz 463 1.1 wiz XML_ParserFree(parser); 464 1.1 wiz } 465