1 /* 2 __ __ _ 3 ___\ \/ /_ __ __ _| |_ 4 / _ \\ /| '_ \ / _` | __| 5 | __// \| |_) | (_| | |_ 6 \___/_/\_\ .__/ \__,_|\__| 7 |_| XML parser 8 9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd 10 Copyright (c) 2000 Clark Cooper <coopercc (at) users.sourceforge.net> 11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake (at) users.sourceforge.net> 12 Copyright (c) 2004-2009 Karl Waclawek <karl (at) waclawek.net> 13 Copyright (c) 2005-2007 Steven Solie <steven (at) solie.ca> 14 Copyright (c) 2016-2025 Sebastian Pipping <sebastian (at) pipping.org> 15 Copyright (c) 2017 Rhodri James <rhodri (at) wildebeest.org.uk> 16 Copyright (c) 2019 David Loffredo <loffredo (at) steptools.com> 17 Copyright (c) 2020 Joe Orton <jorton (at) redhat.com> 18 Copyright (c) 2020 Kleber Tarcsio <klebertarcisio (at) yahoo.com.br> 19 Copyright (c) 2021 Tim Bray <tbray (at) textuality.com> 20 Copyright (c) 2022 Martin Ettl <ettl.martin78 (at) googlemail.com> 21 Copyright (c) 2022 Sean McBride <sean (at) rogue-research.com> 22 Copyright (c) 2025 Alfonso Gregory <gfunni234 (at) gmail.com> 23 Copyright (c) 2026 Matthew Fernandez <matthew.fernandez (at) gmail.com> 24 Licensed under the MIT license: 25 26 Permission is hereby granted, free of charge, to any person obtaining 27 a copy of this software and associated documentation files (the 28 "Software"), to deal in the Software without restriction, including 29 without limitation the rights to use, copy, modify, merge, publish, 30 distribute, sublicense, and/or sell copies of the Software, and to permit 31 persons to whom the Software is furnished to do so, subject to the 32 following conditions: 33 34 The above copyright notice and this permission notice shall be included 35 in all copies or substantial portions of the Software. 36 37 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 38 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 39 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 40 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 41 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 42 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 43 USE OR OTHER DEALINGS IN THE SOFTWARE. 44 */ 45 46 #include "expat_config.h" 47 48 #include <assert.h> 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <stddef.h> 52 #include <string.h> 53 #include <math.h> /* for isnan */ 54 #include <errno.h> 55 56 #include "expat.h" 57 #include "codepage.h" 58 #include "internal.h" /* for UNUSED_P only */ 59 #include "xmlfile.h" 60 #include "xmltchar.h" 61 62 #ifdef _MSC_VER 63 # include <crtdbg.h> 64 #endif 65 66 #ifdef XML_UNICODE 67 # include <wchar.h> 68 #endif 69 70 enum ExitCode { 71 XMLWF_EXIT_SUCCESS = 0, 72 XMLWF_EXIT_INTERNAL_ERROR = 1, 73 XMLWF_EXIT_NOT_WELLFORMED = 2, 74 XMLWF_EXIT_OUTPUT_ERROR = 3, 75 XMLWF_EXIT_USAGE_ERROR = 4, 76 }; 77 78 /* Structures for handler user data */ 79 typedef struct NotationList { 80 struct NotationList *next; 81 const XML_Char *notationName; 82 const XML_Char *systemId; 83 const XML_Char *publicId; 84 } NotationList; 85 86 typedef struct xmlwfUserData { 87 FILE *fp; 88 NotationList *notationListHead; 89 const XML_Char *currentDoctypeName; 90 } XmlwfUserData; 91 92 /* This ensures proper sorting. */ 93 94 #define NSSEP T('\001') 95 96 static void XMLCALL 97 characterData(void *userData, const XML_Char *s, int len) { 98 FILE *fp = ((XmlwfUserData *)userData)->fp; 99 for (; len > 0; --len, ++s) { 100 switch (*s) { 101 case T('&'): 102 fputts(T("&"), fp); 103 break; 104 case T('<'): 105 fputts(T("<"), fp); 106 break; 107 case T('>'): 108 fputts(T(">"), fp); 109 break; 110 #ifdef W3C14N 111 case 13: 112 fputts(T("
"), fp); 113 break; 114 #else 115 case T('"'): 116 fputts(T("""), fp); 117 break; 118 case 9: 119 case 10: 120 case 13: 121 ftprintf(fp, T("&#%d;"), *s); 122 break; 123 #endif 124 default: 125 puttc(*s, fp); 126 break; 127 } 128 } 129 } 130 131 static void 132 attributeValue(FILE *fp, const XML_Char *s) { 133 puttc(T('='), fp); 134 puttc(T('"'), fp); 135 assert(s); 136 for (;;) { 137 switch (*s) { 138 case 0: 139 case NSSEP: 140 puttc(T('"'), fp); 141 return; 142 case T('&'): 143 fputts(T("&"), fp); 144 break; 145 case T('<'): 146 fputts(T("<"), fp); 147 break; 148 case T('"'): 149 fputts(T("""), fp); 150 break; 151 #ifdef W3C14N 152 case 9: 153 fputts(T("	"), fp); 154 break; 155 case 10: 156 fputts(T("
"), fp); 157 break; 158 case 13: 159 fputts(T("
"), fp); 160 break; 161 #else 162 case T('>'): 163 fputts(T(">"), fp); 164 break; 165 case 9: 166 case 10: 167 case 13: 168 ftprintf(fp, T("&#%d;"), *s); 169 break; 170 #endif 171 default: 172 puttc(*s, fp); 173 break; 174 } 175 s++; 176 } 177 } 178 179 /* Lexicographically comparing UTF-8 encoded attribute values, 180 is equivalent to lexicographically comparing based on the character number. */ 181 182 static int 183 attcmp(const void *att1, const void *att2) { 184 return tcscmp(*(const XML_Char *const *)att1, *(const XML_Char *const *)att2); 185 } 186 187 static void XMLCALL 188 startElement(void *userData, const XML_Char *name, const XML_Char **atts) { 189 int nAtts; 190 const XML_Char **p; 191 FILE *fp = ((XmlwfUserData *)userData)->fp; 192 puttc(T('<'), fp); 193 fputts(name, fp); 194 195 p = atts; 196 while (*p) 197 ++p; 198 nAtts = (int)((p - atts) >> 1); 199 if (nAtts > 1) 200 qsort(atts, nAtts, sizeof(XML_Char *) * 2, attcmp); 201 while (*atts) { 202 puttc(T(' '), fp); 203 fputts(*atts++, fp); 204 attributeValue(fp, *atts); 205 atts++; 206 } 207 puttc(T('>'), fp); 208 } 209 210 static void XMLCALL 211 endElement(void *userData, const XML_Char *name) { 212 FILE *fp = ((XmlwfUserData *)userData)->fp; 213 puttc(T('<'), fp); 214 puttc(T('/'), fp); 215 fputts(name, fp); 216 puttc(T('>'), fp); 217 } 218 219 static int 220 nsattcmp(const void *p1, const void *p2) { 221 const XML_Char *att1 = *(const XML_Char *const *)p1; 222 const XML_Char *att2 = *(const XML_Char *const *)p2; 223 int sep1 = (tcsrchr(att1, NSSEP) != 0); 224 int sep2 = (tcsrchr(att2, NSSEP) != 0); 225 if (sep1 != sep2) 226 return sep1 - sep2; 227 return tcscmp(att1, att2); 228 } 229 230 static void XMLCALL 231 startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) { 232 int nAtts; 233 int nsi; 234 const XML_Char **p; 235 FILE *fp = ((XmlwfUserData *)userData)->fp; 236 const XML_Char *sep; 237 puttc(T('<'), fp); 238 239 sep = tcsrchr(name, NSSEP); 240 if (sep) { 241 fputts(T("n1:"), fp); 242 fputts(sep + 1, fp); 243 fputts(T(" xmlns:n1"), fp); 244 attributeValue(fp, name); 245 nsi = 2; 246 } else { 247 fputts(name, fp); 248 nsi = 1; 249 } 250 251 p = atts; 252 while (*p) 253 ++p; 254 nAtts = (int)((p - atts) >> 1); 255 if (nAtts > 1) 256 qsort(atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp); 257 while (*atts) { 258 name = *atts++; 259 sep = tcsrchr(name, NSSEP); 260 puttc(T(' '), fp); 261 if (sep) { 262 ftprintf(fp, T("n%d:"), nsi); 263 fputts(sep + 1, fp); 264 } else 265 fputts(name, fp); 266 attributeValue(fp, *atts); 267 if (sep) { 268 ftprintf(fp, T(" xmlns:n%d"), nsi++); 269 attributeValue(fp, name); 270 } 271 atts++; 272 } 273 puttc(T('>'), fp); 274 } 275 276 static void XMLCALL 277 endElementNS(void *userData, const XML_Char *name) { 278 FILE *fp = ((XmlwfUserData *)userData)->fp; 279 const XML_Char *sep; 280 puttc(T('<'), fp); 281 puttc(T('/'), fp); 282 sep = tcsrchr(name, NSSEP); 283 if (sep) { 284 fputts(T("n1:"), fp); 285 fputts(sep + 1, fp); 286 } else 287 fputts(name, fp); 288 puttc(T('>'), fp); 289 } 290 291 #ifndef W3C14N 292 293 static void XMLCALL 294 processingInstruction(void *userData, const XML_Char *target, 295 const XML_Char *data) { 296 FILE *fp = ((XmlwfUserData *)userData)->fp; 297 puttc(T('<'), fp); 298 puttc(T('?'), fp); 299 fputts(target, fp); 300 puttc(T(' '), fp); 301 fputts(data, fp); 302 puttc(T('?'), fp); 303 puttc(T('>'), fp); 304 } 305 306 static XML_Char * 307 xcsdup(const XML_Char *s) { 308 XML_Char *result; 309 int count = 0; 310 size_t numBytes; 311 312 /* Get the length of the string, including terminator */ 313 while (s[count++] != 0) { 314 /* Do nothing */ 315 } 316 numBytes = count * sizeof(XML_Char); 317 result = malloc(numBytes); 318 if (result == NULL) 319 return NULL; 320 memcpy(result, s, numBytes); 321 return result; 322 } 323 324 static void XMLCALL 325 startDoctypeDecl(void *userData, const XML_Char *doctypeName, 326 const XML_Char *sysid, const XML_Char *publid, 327 int has_internal_subset) { 328 XmlwfUserData *data = userData; 329 UNUSED_P(sysid); 330 UNUSED_P(publid); 331 UNUSED_P(has_internal_subset); 332 data->currentDoctypeName = xcsdup(doctypeName); 333 } 334 335 static void 336 freeNotations(XmlwfUserData *data) { 337 NotationList *notationListHead = data->notationListHead; 338 339 while (notationListHead != NULL) { 340 NotationList *next = notationListHead->next; 341 free((void *)notationListHead->notationName); 342 free((void *)notationListHead->systemId); 343 free((void *)notationListHead->publicId); 344 free(notationListHead); 345 notationListHead = next; 346 } 347 data->notationListHead = NULL; 348 } 349 350 static void 351 cleanupUserData(XmlwfUserData *userData) { 352 free((void *)userData->currentDoctypeName); 353 userData->currentDoctypeName = NULL; 354 freeNotations(userData); 355 } 356 357 static int 358 xcscmp(const XML_Char *xs, const XML_Char *xt) { 359 while (*xs != 0 && *xt != 0) { 360 if (*xs < *xt) 361 return -1; 362 if (*xs > *xt) 363 return 1; 364 xs++; 365 xt++; 366 } 367 if (*xs < *xt) 368 return -1; 369 if (*xs > *xt) 370 return 1; 371 return 0; 372 } 373 374 static int 375 notationCmp(const void *a, const void *b) { 376 const NotationList *const n1 = *(const NotationList *const *)a; 377 const NotationList *const n2 = *(const NotationList *const *)b; 378 379 return xcscmp(n1->notationName, n2->notationName); 380 } 381 382 static void XMLCALL 383 endDoctypeDecl(void *userData) { 384 XmlwfUserData *data = userData; 385 NotationList **notations; 386 int notationCount = 0; 387 NotationList *p; 388 int i; 389 390 /* How many notations do we have? */ 391 for (p = data->notationListHead; p != NULL; p = p->next) 392 notationCount++; 393 if (notationCount == 0) { 394 /* Nothing to report */ 395 goto cleanUp; 396 } 397 398 notations = malloc(notationCount * sizeof(NotationList *)); 399 if (notations == NULL) { 400 fprintf(stderr, "Unable to sort notations"); 401 goto cleanUp; 402 } 403 404 for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) { 405 notations[i] = p; 406 } 407 qsort(notations, notationCount, sizeof(NotationList *), notationCmp); 408 409 /* Output the DOCTYPE header */ 410 fputts(T("<!DOCTYPE "), data->fp); 411 fputts(data->currentDoctypeName, data->fp); 412 fputts(T(" [\n"), data->fp); 413 414 /* Now the NOTATIONs */ 415 for (i = 0; i < notationCount; i++) { 416 fputts(T("<!NOTATION "), data->fp); 417 fputts(notations[i]->notationName, data->fp); 418 if (notations[i]->publicId != NULL) { 419 fputts(T(" PUBLIC '"), data->fp); 420 fputts(notations[i]->publicId, data->fp); 421 puttc(T('\''), data->fp); 422 if (notations[i]->systemId != NULL) { 423 puttc(T(' '), data->fp); 424 puttc(T('\''), data->fp); 425 fputts(notations[i]->systemId, data->fp); 426 puttc(T('\''), data->fp); 427 } 428 } else if (notations[i]->systemId != NULL) { 429 fputts(T(" SYSTEM '"), data->fp); 430 fputts(notations[i]->systemId, data->fp); 431 puttc(T('\''), data->fp); 432 } 433 puttc(T('>'), data->fp); 434 puttc(T('\n'), data->fp); 435 } 436 437 /* Finally end the DOCTYPE */ 438 fputts(T("]>\n"), data->fp); 439 440 free(notations); 441 442 cleanUp: 443 freeNotations(data); 444 free((void *)data->currentDoctypeName); 445 data->currentDoctypeName = NULL; 446 } 447 448 static void XMLCALL 449 notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base, 450 const XML_Char *systemId, const XML_Char *publicId) { 451 XmlwfUserData *data = userData; 452 NotationList *entry = malloc(sizeof(NotationList)); 453 const char *errorMessage = "Unable to store NOTATION for output\n"; 454 455 UNUSED_P(base); 456 if (entry == NULL) { 457 fputs(errorMessage, stderr); 458 return; /* Nothing we can really do about this */ 459 } 460 entry->notationName = xcsdup(notationName); 461 if (entry->notationName == NULL) { 462 fputs(errorMessage, stderr); 463 free(entry); 464 return; 465 } 466 if (systemId != NULL) { 467 entry->systemId = xcsdup(systemId); 468 if (entry->systemId == NULL) { 469 fputs(errorMessage, stderr); 470 free((void *)entry->notationName); 471 free(entry); 472 return; 473 } 474 } else { 475 entry->systemId = NULL; 476 } 477 if (publicId != NULL) { 478 entry->publicId = xcsdup(publicId); 479 if (entry->publicId == NULL) { 480 fputs(errorMessage, stderr); 481 free((void *)entry->systemId); /* Safe if it's NULL */ 482 free((void *)entry->notationName); 483 free(entry); 484 return; 485 } 486 } else { 487 entry->publicId = NULL; 488 } 489 490 entry->next = data->notationListHead; 491 data->notationListHead = entry; 492 } 493 494 #endif /* not W3C14N */ 495 496 static void XMLCALL 497 defaultCharacterData(void *userData, const XML_Char *s, int len) { 498 UNUSED_P(s); 499 UNUSED_P(len); 500 XML_DefaultCurrent(userData); 501 } 502 503 static void XMLCALL 504 defaultStartElement(void *userData, const XML_Char *name, 505 const XML_Char **atts) { 506 UNUSED_P(name); 507 UNUSED_P(atts); 508 XML_DefaultCurrent(userData); 509 } 510 511 static void XMLCALL 512 defaultEndElement(void *userData, const XML_Char *name) { 513 UNUSED_P(name); 514 XML_DefaultCurrent(userData); 515 } 516 517 static void XMLCALL 518 defaultProcessingInstruction(void *userData, const XML_Char *target, 519 const XML_Char *data) { 520 UNUSED_P(target); 521 UNUSED_P(data); 522 XML_DefaultCurrent(userData); 523 } 524 525 static void XMLCALL 526 nopCharacterData(void *userData, const XML_Char *s, int len) { 527 UNUSED_P(userData); 528 UNUSED_P(s); 529 UNUSED_P(len); 530 } 531 532 static void XMLCALL 533 nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 534 UNUSED_P(userData); 535 UNUSED_P(name); 536 UNUSED_P(atts); 537 } 538 539 static void XMLCALL 540 nopEndElement(void *userData, const XML_Char *name) { 541 UNUSED_P(userData); 542 UNUSED_P(name); 543 } 544 545 static void XMLCALL 546 nopProcessingInstruction(void *userData, const XML_Char *target, 547 const XML_Char *data) { 548 UNUSED_P(userData); 549 UNUSED_P(target); 550 UNUSED_P(data); 551 } 552 553 static void XMLCALL 554 markup(void *userData, const XML_Char *s, int len) { 555 FILE *fp = ((XmlwfUserData *)XML_GetUserData(userData))->fp; 556 for (; len > 0; --len, ++s) 557 puttc(*s, fp); 558 } 559 560 static void 561 metaLocation(XML_Parser parser) { 562 const XML_Char *uri = XML_GetBase(parser); 563 FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp; 564 if (uri) 565 ftprintf(fp, T(" uri=\"%s\""), uri); 566 ftprintf(fp, 567 T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"") 568 T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%") 569 T(XML_FMT_INT_MOD) T("u\""), 570 XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser), 571 XML_GetCurrentLineNumber(parser), 572 XML_GetCurrentColumnNumber(parser)); 573 } 574 575 static void 576 metaStartDocument(void *userData) { 577 fputts(T("<document>\n"), ((XmlwfUserData *)XML_GetUserData(userData))->fp); 578 } 579 580 static void 581 metaEndDocument(void *userData) { 582 fputts(T("</document>\n"), ((XmlwfUserData *)XML_GetUserData(userData))->fp); 583 } 584 585 static void XMLCALL 586 metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) { 587 XML_Parser parser = userData; 588 XmlwfUserData *data = XML_GetUserData(parser); 589 FILE *fp = data->fp; 590 const XML_Char **specifiedAttsEnd 591 = atts + XML_GetSpecifiedAttributeCount(parser); 592 const XML_Char **idAttPtr; 593 int idAttIndex = XML_GetIdAttributeIndex(parser); 594 if (idAttIndex < 0) 595 idAttPtr = 0; 596 else 597 idAttPtr = atts + idAttIndex; 598 599 ftprintf(fp, T("<starttag name=\"%s\""), name); 600 metaLocation(parser); 601 if (*atts) { 602 fputts(T(">\n"), fp); 603 do { 604 ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]); 605 characterData(data, atts[1], (int)tcslen(atts[1])); 606 if (atts >= specifiedAttsEnd) 607 fputts(T("\" defaulted=\"yes\"/>\n"), fp); 608 else if (atts == idAttPtr) 609 fputts(T("\" id=\"yes\"/>\n"), fp); 610 else 611 fputts(T("\"/>\n"), fp); 612 } while (*(atts += 2)); 613 fputts(T("</starttag>\n"), fp); 614 } else 615 fputts(T("/>\n"), fp); 616 } 617 618 static void XMLCALL 619 metaEndElement(void *userData, const XML_Char *name) { 620 XML_Parser parser = userData; 621 XmlwfUserData *data = XML_GetUserData(parser); 622 FILE *fp = data->fp; 623 ftprintf(fp, T("<endtag name=\"%s\""), name); 624 metaLocation(parser); 625 fputts(T("/>\n"), fp); 626 } 627 628 static void XMLCALL 629 metaProcessingInstruction(void *userData, const XML_Char *target, 630 const XML_Char *data) { 631 XML_Parser parser = userData; 632 XmlwfUserData *usrData = XML_GetUserData(parser); 633 FILE *fp = usrData->fp; 634 ftprintf(fp, T("<pi target=\"%s\" data=\""), target); 635 characterData(usrData, data, (int)tcslen(data)); 636 puttc(T('"'), fp); 637 metaLocation(parser); 638 fputts(T("/>\n"), fp); 639 } 640 641 static void XMLCALL 642 metaComment(void *userData, const XML_Char *data) { 643 XML_Parser parser = userData; 644 XmlwfUserData *usrData = XML_GetUserData(parser); 645 FILE *fp = usrData->fp; 646 fputts(T("<comment data=\""), fp); 647 characterData(usrData, data, (int)tcslen(data)); 648 puttc(T('"'), fp); 649 metaLocation(parser); 650 fputts(T("/>\n"), fp); 651 } 652 653 static void XMLCALL 654 metaStartCdataSection(void *userData) { 655 XML_Parser parser = userData; 656 XmlwfUserData *data = XML_GetUserData(parser); 657 FILE *fp = data->fp; 658 fputts(T("<startcdata"), fp); 659 metaLocation(parser); 660 fputts(T("/>\n"), fp); 661 } 662 663 static void XMLCALL 664 metaEndCdataSection(void *userData) { 665 XML_Parser parser = userData; 666 XmlwfUserData *data = XML_GetUserData(parser); 667 FILE *fp = data->fp; 668 fputts(T("<endcdata"), fp); 669 metaLocation(parser); 670 fputts(T("/>\n"), fp); 671 } 672 673 static void XMLCALL 674 metaCharacterData(void *userData, const XML_Char *s, int len) { 675 XML_Parser parser = userData; 676 XmlwfUserData *data = XML_GetUserData(parser); 677 FILE *fp = data->fp; 678 fputts(T("<chars str=\""), fp); 679 characterData(data, s, len); 680 puttc(T('"'), fp); 681 metaLocation(parser); 682 fputts(T("/>\n"), fp); 683 } 684 685 static void XMLCALL 686 metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName, 687 const XML_Char *sysid, const XML_Char *pubid, 688 int has_internal_subset) { 689 XML_Parser parser = userData; 690 XmlwfUserData *data = XML_GetUserData(parser); 691 FILE *fp = data->fp; 692 UNUSED_P(sysid); 693 UNUSED_P(pubid); 694 UNUSED_P(has_internal_subset); 695 ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName); 696 metaLocation(parser); 697 fputts(T("/>\n"), fp); 698 } 699 700 static void XMLCALL 701 metaEndDoctypeDecl(void *userData) { 702 XML_Parser parser = userData; 703 XmlwfUserData *data = XML_GetUserData(parser); 704 FILE *fp = data->fp; 705 fputts(T("<enddoctype"), fp); 706 metaLocation(parser); 707 fputts(T("/>\n"), fp); 708 } 709 710 static void XMLCALL 711 metaNotationDecl(void *userData, const XML_Char *notationName, 712 const XML_Char *base, const XML_Char *systemId, 713 const XML_Char *publicId) { 714 XML_Parser parser = userData; 715 XmlwfUserData *data = XML_GetUserData(parser); 716 FILE *fp = data->fp; 717 UNUSED_P(base); 718 ftprintf(fp, T("<notation name=\"%s\""), notationName); 719 if (publicId) 720 ftprintf(fp, T(" public=\"%s\""), publicId); 721 if (systemId) { 722 fputts(T(" system=\""), fp); 723 characterData(data, systemId, (int)tcslen(systemId)); 724 puttc(T('"'), fp); 725 } 726 metaLocation(parser); 727 fputts(T("/>\n"), fp); 728 } 729 730 static void XMLCALL 731 metaEntityDecl(void *userData, const XML_Char *entityName, int is_param, 732 const XML_Char *value, int value_length, const XML_Char *base, 733 const XML_Char *systemId, const XML_Char *publicId, 734 const XML_Char *notationName) { 735 XML_Parser parser = userData; 736 XmlwfUserData *data = XML_GetUserData(parser); 737 FILE *fp = data->fp; 738 739 UNUSED_P(is_param); 740 UNUSED_P(base); 741 if (value) { 742 ftprintf(fp, T("<entity name=\"%s\""), entityName); 743 metaLocation(parser); 744 puttc(T('>'), fp); 745 characterData(data, value, value_length); 746 fputts(T("</entity/>\n"), fp); 747 } else if (notationName) { 748 ftprintf(fp, T("<entity name=\"%s\""), entityName); 749 if (publicId) 750 ftprintf(fp, T(" public=\"%s\""), publicId); 751 fputts(T(" system=\""), fp); 752 characterData(data, systemId, (int)tcslen(systemId)); 753 puttc(T('"'), fp); 754 ftprintf(fp, T(" notation=\"%s\""), notationName); 755 metaLocation(parser); 756 fputts(T("/>\n"), fp); 757 } else { 758 ftprintf(fp, T("<entity name=\"%s\""), entityName); 759 if (publicId) 760 ftprintf(fp, T(" public=\"%s\""), publicId); 761 fputts(T(" system=\""), fp); 762 characterData(data, systemId, (int)tcslen(systemId)); 763 puttc(T('"'), fp); 764 metaLocation(parser); 765 fputts(T("/>\n"), fp); 766 } 767 } 768 769 static void XMLCALL 770 metaStartNamespaceDecl(void *userData, const XML_Char *prefix, 771 const XML_Char *uri) { 772 XML_Parser parser = userData; 773 XmlwfUserData *data = XML_GetUserData(parser); 774 FILE *fp = data->fp; 775 fputts(T("<startns"), fp); 776 if (prefix) 777 ftprintf(fp, T(" prefix=\"%s\""), prefix); 778 if (uri) { 779 fputts(T(" ns=\""), fp); 780 characterData(data, uri, (int)tcslen(uri)); 781 fputts(T("\"/>\n"), fp); 782 } else 783 fputts(T("/>\n"), fp); 784 } 785 786 static void XMLCALL 787 metaEndNamespaceDecl(void *userData, const XML_Char *prefix) { 788 XML_Parser parser = userData; 789 XmlwfUserData *data = XML_GetUserData(parser); 790 FILE *fp = data->fp; 791 if (! prefix) 792 fputts(T("<endns/>\n"), fp); 793 else 794 ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix); 795 } 796 797 static int XMLCALL 798 unknownEncodingConvert(void *data, const char *p) { 799 return codepageConvert(*(int *)data, p); 800 } 801 802 static int XMLCALL 803 unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) { 804 int cp; 805 static const XML_Char prefixL[] = T("windows-"); 806 static const XML_Char prefixU[] = T("WINDOWS-"); 807 int i; 808 809 UNUSED_P(userData); 810 for (i = 0; prefixU[i]; i++) 811 if (name[i] != prefixU[i] && name[i] != prefixL[i]) 812 return 0; 813 814 cp = 0; 815 for (; name[i]; i++) { 816 static const XML_Char digits[] = T("0123456789"); 817 const XML_Char *s = tcschr(digits, name[i]); 818 if (! s) 819 return 0; 820 cp *= 10; 821 cp += (int)(s - digits); 822 if (cp >= 0x10000) 823 return 0; 824 } 825 if (! codepageMap(cp, info->map)) 826 return 0; 827 info->convert = unknownEncodingConvert; 828 /* We could just cast the code page integer to a void *, 829 and avoid the use of release. */ 830 info->release = free; 831 info->data = malloc(sizeof(int)); 832 if (! info->data) 833 return 0; 834 *(int *)info->data = cp; 835 return 1; 836 } 837 838 static int XMLCALL 839 notStandalone(void *userData) { 840 UNUSED_P(userData); 841 return 0; 842 } 843 844 static void 845 showVersion(XML_Char *prog) { 846 XML_Char *s = prog; 847 XML_Char ch; 848 const XML_Feature *features = XML_GetFeatureList(); 849 while ((ch = *s) != 0) { 850 if (ch == '/' 851 #if defined(_WIN32) 852 || ch == '\\' 853 #endif 854 ) 855 prog = s + 1; 856 ++s; 857 } 858 ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion()); 859 if (features != NULL && features[0].feature != XML_FEATURE_END) { 860 int i = 1; 861 ftprintf(stdout, T("%s"), features[0].name); 862 if (features[0].value) 863 ftprintf(stdout, T("=%ld"), features[0].value); 864 while (features[i].feature != XML_FEATURE_END) { 865 ftprintf(stdout, T(", %s"), features[i].name); 866 if (features[i].value) 867 ftprintf(stdout, T("=%ld"), features[i].value); 868 ++i; 869 } 870 ftprintf(stdout, T("\n")); 871 } 872 } 873 874 #if defined(__GNUC__) 875 __attribute__((noreturn)) 876 #endif 877 static void 878 usage(const XML_Char *prog, int rc) { 879 ftprintf( 880 stderr, 881 /* Generated with: 882 * $ xmlwf/xmlwf_helpgen.sh 883 * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of 884 * xmlwf/xmlwf_helpgen.sh in here. 885 */ 886 /* clang-format off */ 887 T("usage:\n") 888 T(" %s [OPTIONS] [FILE ...]\n") 889 T(" %s -h|--help\n") 890 T(" %s -v|--version\n") 891 T("\n") 892 T("xmlwf - Determines if an XML document is well-formed\n") 893 T("\n") 894 T("positional arguments:\n") 895 T(" FILE file to process (default: STDIN)\n") 896 T("\n") 897 T("input control arguments:\n") 898 T(" -s print an error if the document is not [s]tandalone\n") 899 T(" -n enable [n]amespace processing\n") 900 T(" -p enable processing of external DTDs and [p]arameter entities\n") 901 T(" -x enable processing of e[x]ternal entities\n") 902 T(" (CAREFUL! This makes xmlwf vulnerable to external entity attacks (XXE).)\n") 903 T(" -e ENCODING override any in-document [e]ncoding declaration\n") 904 T(" -w enable support for [W]indows code pages\n") 905 T(" -r disable memory-mapping and use [r]ead calls instead\n") 906 T(" -g BYTES buffer size to request per call pair to XML_[G]etBuffer and read (default: 8 KiB)\n") 907 T(" -k when processing multiple files, [k]eep processing after first file with error\n") 908 T("\n") 909 T("output control arguments:\n") 910 T(" -d DIRECTORY output [d]estination directory\n") 911 T(" -c write a [c]opy of input XML, not canonical XML\n") 912 T(" -m write [m]eta XML, not canonical XML\n") 913 T(" -t write no XML output for [t]iming of plain parsing\n") 914 T(" -N enable adding doctype and [n]otation declarations\n") 915 T("\n") 916 T("amplification attack protection (e.g. billion laughs):\n") 917 T(" NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n") 918 T("\n") 919 T(" -a FACTOR set maximum tolerated [a]mplification factor (default: 100.0)\n") 920 T(" -b BYTES set number of output [b]ytes needed to activate (default: 8 MiB/64 MiB)\n") 921 T("\n") 922 T("reparse deferral:\n") 923 T(" -q disable reparse deferral, and allow [q]uadratic parse runtime with large tokens\n") 924 T("\n") 925 T("info arguments:\n") 926 T(" -h, --help show this [h]elp message and exit\n") 927 T(" -v, --version show program's [v]ersion number and exit\n") 928 T("\n") 929 T("environment variables:\n") 930 T(" EXPAT_ACCOUNTING_DEBUG=(0|1|2|3)\n") 931 T(" Control verbosity of accounting debugging (default: 0)\n") 932 T(" EXPAT_ENTITY_DEBUG=(0|1)\n") 933 T(" Control verbosity of entity debugging (default: 0)\n") 934 T(" EXPAT_ENTROPY_DEBUG=(0|1)\n") 935 T(" Control verbosity of entropy debugging (default: 0)\n") 936 T(" EXPAT_MALLOC_DEBUG=(0|1|2)\n") 937 T(" Control verbosity of allocation tracker (default: 0)\n") 938 T("\n") 939 T("exit status:\n") 940 T(" 0 the input files are well-formed and the output (if requested) was written successfully\n") 941 T(" 1 could not allocate data structures, signals a serious problem with execution environment\n") 942 T(" 2 one or more input files were not well-formed\n") 943 T(" 3 could not create an output file\n") 944 T(" 4 command-line argument error\n") 945 T("\n") 946 T("xmlwf of libexpat is software libre, licensed under the MIT license.\n") 947 T("Please report bugs at https://github.com/libexpat/libexpat/issues -- thank you!\n") 948 , /* clang-format on */ 949 prog, prog, prog); 950 exit(rc); 951 } 952 953 #if defined(__MINGW32__) && defined(XML_UNICODE) 954 /* Silence warning about missing prototype */ 955 int wmain(int argc, XML_Char **argv); 956 #endif 957 958 #define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j) \ 959 { \ 960 if (argv[i][j + 1] == T('\0')) { \ 961 if (++i == argc) { \ 962 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); \ 963 /* usage called exit(..), never gets here */ \ 964 } \ 965 constCharStarTarget = argv[i]; \ 966 } else { \ 967 constCharStarTarget = argv[i] + j + 1; \ 968 } \ 969 i++; \ 970 j = 0; \ 971 } 972 973 int 974 tmain(int argc, XML_Char **argv) { 975 int i, j; 976 const XML_Char *outputDir = NULL; 977 const XML_Char *encoding = NULL; 978 unsigned processFlags = XML_MAP_FILE; 979 int windowsCodePages = 0; 980 int outputType = 0; 981 int useNamespaces = 0; 982 int requireStandalone = 0; 983 int requiresNotations = 0; 984 int continueOnError = 0; 985 986 float attackMaximumAmplification = -1.0f; /* signaling "not set" */ 987 unsigned long long attackThresholdBytes = 0; 988 XML_Bool attackThresholdGiven = XML_FALSE; 989 990 XML_Bool disableDeferral = XML_FALSE; 991 992 int exitCode = XMLWF_EXIT_SUCCESS; 993 enum XML_ParamEntityParsing paramEntityParsing 994 = XML_PARAM_ENTITY_PARSING_NEVER; 995 int useStdin = 0; 996 XmlwfUserData userData = {NULL, NULL, NULL}; 997 998 #ifdef _MSC_VER 999 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); 1000 #endif 1001 1002 i = 1; 1003 j = 0; 1004 while (i < argc) { 1005 if (j == 0) { 1006 if (argv[i][0] != T('-')) 1007 break; 1008 if (argv[i][1] == T('-')) { 1009 if (argv[i][2] == T('\0')) { 1010 i++; 1011 break; 1012 } else if (tcscmp(argv[i] + 2, T("help")) == 0) { 1013 usage(argv[0], XMLWF_EXIT_SUCCESS); 1014 // usage called exit(..), never gets here 1015 } else if (tcscmp(argv[i] + 2, T("version")) == 0) { 1016 showVersion(argv[0]); 1017 return XMLWF_EXIT_SUCCESS; 1018 } 1019 } 1020 j++; 1021 } 1022 switch (argv[i][j]) { 1023 case T('r'): 1024 processFlags &= ~XML_MAP_FILE; 1025 j++; 1026 break; 1027 case T('s'): 1028 requireStandalone = 1; 1029 j++; 1030 break; 1031 case T('n'): 1032 useNamespaces = 1; 1033 j++; 1034 break; 1035 case T('p'): 1036 paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS; 1037 /* fall through */ 1038 case T('x'): 1039 processFlags |= XML_EXTERNAL_ENTITIES; 1040 j++; 1041 break; 1042 case T('w'): 1043 windowsCodePages = 1; 1044 j++; 1045 break; 1046 case T('m'): 1047 outputType = 'm'; 1048 j++; 1049 break; 1050 case T('c'): 1051 outputType = 'c'; 1052 useNamespaces = 0; 1053 j++; 1054 break; 1055 case T('t'): 1056 outputType = 't'; 1057 j++; 1058 break; 1059 case T('N'): 1060 requiresNotations = 1; 1061 j++; 1062 break; 1063 case T('d'): 1064 XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j); 1065 break; 1066 case T('e'): 1067 XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j); 1068 break; 1069 case T('h'): 1070 usage(argv[0], XMLWF_EXIT_SUCCESS); 1071 // usage called exit(..), never gets here 1072 case T('v'): 1073 showVersion(argv[0]); 1074 return XMLWF_EXIT_SUCCESS; 1075 case T('g'): { 1076 const XML_Char *valueText = NULL; 1077 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1078 1079 errno = 0; 1080 XML_Char *afterValueText = (XML_Char *)valueText; 1081 const long long read_size_bytes_candidate 1082 = tcstoull(valueText, &afterValueText, 10); 1083 if ((errno != 0) || (afterValueText[0] != T('\0')) 1084 || (read_size_bytes_candidate < 1) 1085 || (read_size_bytes_candidate > (INT_MAX / 2 + 1))) { 1086 // This prevents tperror(..) from reporting misleading "[..]: Success" 1087 errno = ERANGE; 1088 tperror(T("invalid buffer size") T( 1089 " (needs an integer from 1 to INT_MAX/2+1 i.e. 1,073,741,824 on most platforms)")); 1090 exit(XMLWF_EXIT_USAGE_ERROR); 1091 } 1092 g_read_size_bytes = (int)read_size_bytes_candidate; 1093 break; 1094 } 1095 case T('k'): 1096 continueOnError = 1; 1097 j++; 1098 break; 1099 case T('a'): { 1100 const XML_Char *valueText = NULL; 1101 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1102 1103 errno = 0; 1104 XML_Char *afterValueText = NULL; 1105 attackMaximumAmplification = tcstof(valueText, &afterValueText); 1106 if ((errno != 0) || (afterValueText[0] != T('\0')) 1107 || isnan(attackMaximumAmplification) 1108 || (attackMaximumAmplification < 1.0f)) { 1109 // This prevents tperror(..) from reporting misleading "[..]: Success" 1110 errno = ERANGE; 1111 tperror(T("invalid amplification limit") T( 1112 " (needs a floating point number greater or equal than 1.0)")); 1113 exit(XMLWF_EXIT_USAGE_ERROR); 1114 } 1115 #if XML_GE == 0 1116 ftprintf(stderr, 1117 T("Warning: Given amplification limit ignored") 1118 T(", xmlwf has been compiled without DTD/GE support.\n")); 1119 #endif 1120 break; 1121 } 1122 case T('b'): { 1123 const XML_Char *valueText = NULL; 1124 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j); 1125 1126 errno = 0; 1127 XML_Char *afterValueText = (XML_Char *)valueText; 1128 attackThresholdBytes = tcstoull(valueText, &afterValueText, 10); 1129 if ((errno != 0) || (afterValueText[0] != T('\0'))) { 1130 // This prevents tperror(..) from reporting misleading "[..]: Success" 1131 errno = ERANGE; 1132 tperror(T("invalid ignore threshold") 1133 T(" (needs an integer from 0 to 2^64-1)")); 1134 exit(XMLWF_EXIT_USAGE_ERROR); 1135 } 1136 attackThresholdGiven = XML_TRUE; 1137 #if XML_GE == 0 1138 ftprintf(stderr, 1139 T("Warning: Given attack threshold ignored") 1140 T(", xmlwf has been compiled without DTD/GE support.\n")); 1141 #endif 1142 break; 1143 } 1144 case T('q'): { 1145 disableDeferral = XML_TRUE; 1146 j++; 1147 break; 1148 } 1149 case T('\0'): 1150 if (j > 1) { 1151 i++; 1152 j = 0; 1153 break; 1154 } 1155 /* fall through */ 1156 default: 1157 usage(argv[0], XMLWF_EXIT_USAGE_ERROR); 1158 // usage called exit(..), never gets here 1159 } 1160 } 1161 if (i == argc) { 1162 useStdin = 1; 1163 processFlags &= ~XML_MAP_FILE; 1164 i--; 1165 } 1166 for (; i < argc; i++) { 1167 XML_Char *outName = 0; 1168 int result; 1169 XML_Parser parser; 1170 if (useNamespaces) 1171 parser = XML_ParserCreateNS(encoding, NSSEP); 1172 else 1173 parser = XML_ParserCreate(encoding); 1174 1175 if (! parser) { 1176 tperror(T("Could not instantiate parser")); 1177 exit(XMLWF_EXIT_INTERNAL_ERROR); 1178 } 1179 1180 if (attackMaximumAmplification != -1.0f) { 1181 #if XML_GE == 1 1182 XML_SetBillionLaughsAttackProtectionMaximumAmplification( 1183 parser, attackMaximumAmplification); 1184 XML_SetAllocTrackerMaximumAmplification(parser, 1185 attackMaximumAmplification); 1186 #endif 1187 } 1188 if (attackThresholdGiven) { 1189 #if XML_GE == 1 1190 XML_SetBillionLaughsAttackProtectionActivationThreshold( 1191 parser, attackThresholdBytes); 1192 XML_SetAllocTrackerActivationThreshold(parser, attackThresholdBytes); 1193 #else 1194 (void)attackThresholdBytes; // silence -Wunused-but-set-variable 1195 #endif 1196 } 1197 1198 if (disableDeferral) { 1199 const XML_Bool success = XML_SetReparseDeferralEnabled(parser, XML_FALSE); 1200 if (! success) { 1201 // This prevents tperror(..) from reporting misleading "[..]: Success" 1202 errno = EINVAL; 1203 tperror(T("Failed to disable reparse deferral")); 1204 exit(XMLWF_EXIT_INTERNAL_ERROR); 1205 } 1206 } 1207 1208 if (requireStandalone) 1209 XML_SetNotStandaloneHandler(parser, notStandalone); 1210 XML_SetParamEntityParsing(parser, paramEntityParsing); 1211 if (outputType == 't') { 1212 /* This is for doing timings; this gives a more realistic estimate of 1213 the parsing time. */ 1214 outputDir = 0; 1215 XML_SetElementHandler(parser, nopStartElement, nopEndElement); 1216 XML_SetCharacterDataHandler(parser, nopCharacterData); 1217 XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction); 1218 } else if (outputDir) { 1219 const XML_Char *delim = T("/"); 1220 const XML_Char *file = useStdin ? T("STDIN") : argv[i]; 1221 if (! useStdin) { 1222 /* Jump after last (back)slash */ 1223 const XML_Char *lastDelim = tcsrchr(file, delim[0]); 1224 if (lastDelim) 1225 file = lastDelim + 1; 1226 #if defined(_WIN32) 1227 else { 1228 const XML_Char *winDelim = T("\\"); 1229 lastDelim = tcsrchr(file, winDelim[0]); 1230 if (lastDelim) { 1231 file = lastDelim + 1; 1232 delim = winDelim; 1233 } 1234 } 1235 #endif 1236 } 1237 outName 1238 = malloc((tcslen(outputDir) + tcslen(file) + 2) * sizeof(XML_Char)); 1239 if (! outName) { 1240 tperror(T("Could not allocate memory")); 1241 exit(XMLWF_EXIT_INTERNAL_ERROR); 1242 } 1243 tcscpy(outName, outputDir); 1244 tcscat(outName, delim); 1245 tcscat(outName, file); 1246 userData.fp = tfopen(outName, T("wb")); 1247 if (! userData.fp) { 1248 tperror(outName); 1249 exitCode = XMLWF_EXIT_OUTPUT_ERROR; 1250 free(outName); 1251 XML_ParserFree(parser); 1252 if (continueOnError) { 1253 continue; 1254 } else { 1255 break; 1256 } 1257 } 1258 setvbuf(userData.fp, NULL, _IOFBF, 16384); 1259 #ifdef XML_UNICODE 1260 puttc(0xFEFF, userData.fp); 1261 #endif 1262 XML_SetUserData(parser, &userData); 1263 switch (outputType) { 1264 case 'm': 1265 XML_UseParserAsHandlerArg(parser); 1266 XML_SetElementHandler(parser, metaStartElement, metaEndElement); 1267 XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction); 1268 XML_SetCommentHandler(parser, metaComment); 1269 XML_SetCdataSectionHandler(parser, metaStartCdataSection, 1270 metaEndCdataSection); 1271 XML_SetCharacterDataHandler(parser, metaCharacterData); 1272 XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl, 1273 metaEndDoctypeDecl); 1274 XML_SetEntityDeclHandler(parser, metaEntityDecl); 1275 XML_SetNotationDeclHandler(parser, metaNotationDecl); 1276 XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl, 1277 metaEndNamespaceDecl); 1278 metaStartDocument(parser); 1279 break; 1280 case 'c': 1281 XML_UseParserAsHandlerArg(parser); 1282 XML_SetDefaultHandler(parser, markup); 1283 XML_SetElementHandler(parser, defaultStartElement, defaultEndElement); 1284 XML_SetCharacterDataHandler(parser, defaultCharacterData); 1285 XML_SetProcessingInstructionHandler(parser, 1286 defaultProcessingInstruction); 1287 break; 1288 default: 1289 if (useNamespaces) 1290 XML_SetElementHandler(parser, startElementNS, endElementNS); 1291 else 1292 XML_SetElementHandler(parser, startElement, endElement); 1293 XML_SetCharacterDataHandler(parser, characterData); 1294 #ifndef W3C14N 1295 XML_SetProcessingInstructionHandler(parser, processingInstruction); 1296 if (requiresNotations) { 1297 XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl); 1298 XML_SetNotationDeclHandler(parser, notationDecl); 1299 } 1300 #endif /* not W3C14N */ 1301 break; 1302 } 1303 } 1304 if (windowsCodePages) 1305 XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0); 1306 result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags); 1307 if (outputDir) { 1308 if (outputType == 'm') 1309 metaEndDocument(parser); 1310 fclose(userData.fp); 1311 if (! result) { 1312 tremove(outName); 1313 } 1314 free(outName); 1315 } 1316 XML_ParserFree(parser); 1317 if (! result) { 1318 exitCode = XMLWF_EXIT_NOT_WELLFORMED; 1319 cleanupUserData(&userData); 1320 if (! continueOnError) { 1321 break; 1322 } 1323 } 1324 } 1325 return exitCode; 1326 } 1327