Home | History | Annotate | Line # | Download | only in isc
      1 /*	$NetBSD: url.c,v 1.6 2025/01/26 16:25:39 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  *
      6  * SPDX-License-Identifier: MPL-2.0 and MIT
      7  *
      8  * This Source Code Form is subject to the terms of the Mozilla Public
      9  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  *
     12  * See the COPYRIGHT file distributed with this work for additional
     13  * information regarding copyright ownership.
     14  */
     15 
     16 /*
     17  * Copyright Joyent, Inc. and other Node contributors. All rights reserved.
     18  *
     19  * Permission is hereby granted, free of charge, to any person obtaining a copy
     20  * of this software and associated documentation files (the "Software"), to
     21  * deal in the Software without restriction, including without limitation the
     22  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
     23  * sell copies of the Software, and to permit persons to whom the Software is
     24  * furnished to do so, subject to the following conditions:
     25  *
     26  * The above copyright notice and this permission notice shall be included in
     27  * all copies or substantial portions of the Software.
     28  *
     29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     32  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     33  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     34  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     35  * IN THE SOFTWARE.
     36  */
     37 
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <isc/url.h>
#include <isc/util.h>
     45 
/*
 * BIT_AT(a, i) evaluates to 1 when bit number 'i' of the byte array 'a' is
 * set and to 0 otherwise.  Bit 'i' lives in byte (i >> 3) at bit position
 * (i & 7), least significant bit first.  'i' is evaluated twice, so it must
 * not have side effects.
 */
#ifndef BIT_AT
#define BIT_AT(a, i)                                   \
	(((unsigned int)(a)[(unsigned int)(i) >> 3] & \
	  (1U << ((unsigned int)(i) & 7))) != 0)
#endif
     51 
/*
 * T(v) wraps the bits of characters that are only accepted when
 * HTTP_PARSER_STRICT is not enabled: it expands to 'v' in the lenient build
 * and to 0 in the strict one.  It is only needed while the table below is
 * being defined and is undefined again right after it.
 */
#if HTTP_PARSER_STRICT
#define T(v) 0
#else
#define T(v) v
#endif

/*
 * Bitmap over the 128 ASCII code points, one bit per character: character
 * 'c' is looked up as bit (c & 7) of byte (c >> 3), which is the layout
 * BIT_AT() expects.  A set bit means the character is accepted by
 * IS_URL_CHAR().  Each row below covers eight consecutive code points; the
 * comment names the ones whose bit is clear.
 */
static const uint8_t normal_url_char[32] = {
	/*   0 -   7: nul soh stx etx eot enq ack bel -- none allowed */
	0x00,
	/*   8 -  15: only ht (9) and np (12), and only when not strict */
	T(0x02) | T(0x10),
	/*  16 -  23: dle dc1 dc2 dc3 dc4 nak syn etb -- none allowed */
	0x00,
	/*  24 -  31: can em sub esc fs gs rs us -- none allowed */
	0x00,
	/*  32 -  39: sp ! " # $ % & ' -- all but sp (32) and '#' (35) */
	0xf6,
	/*  40 -  47: ( ) * + , - . / -- all allowed */
	0xff,
	/*  48 -  55: 0 1 2 3 4 5 6 7 -- all allowed */
	0xff,
	/*  56 -  63: 8 9 : ; < = > ? -- all but '?' (63) */
	0x7f,
	/*  64 -  71: @ A B C D E F G -- all allowed */
	0xff,
	/*  72 -  79: H I J K L M N O -- all allowed */
	0xff,
	/*  80 -  87: P Q R S T U V W -- all allowed */
	0xff,
	/*  88 -  95: X Y Z [ \ ] ^ _ -- all allowed */
	0xff,
	/*  96 - 103: ` a b c d e f g -- all allowed */
	0xff,
	/* 104 - 111: h i j k l m n o -- all allowed */
	0xff,
	/* 112 - 119: p q r s t u v w -- all allowed */
	0xff,
	/* 120 - 127: x y z { | } ~ del -- all but del (127) */
	0x7f,
};

#undef T
     94 
/*
 * Parser states.
 *
 * The enumerator order is significant (values are assigned implicitly,
 * starting from 1) and must not be changed.  Within this file only s_dead
 * and the URL states s_req_spaces_before_url .. s_req_fragment are produced
 * or consumed, by parse_url_char() and isc_url_parse(); the remaining
 * enumerators are not referenced here.
 */
typedef enum {
	/* Error state; important that this is > 0 */
	s_dead = 1,

	/* States named after the start of a message and the response line */
	s_start_req_or_res,
	s_res_or_resp_H,
	s_start_res,
	s_res_H,
	s_res_HT,
	s_res_HTT,
	s_res_HTTP,
	s_res_http_major,
	s_res_http_dot,
	s_res_http_minor,
	s_res_http_end,
	s_res_first_status_code,
	s_res_status_code,
	s_res_status_start,
	s_res_status,
	s_res_line_almost_done,

	/* States named after the start of a request line */
	s_start_req,

	s_req_method,

	/* URL states; these are the ones parse_url_char() steps through */
	s_req_spaces_before_url,
	s_req_schema,
	s_req_schema_slash,
	s_req_schema_slash_slash,
	s_req_server_start,
	s_req_server,
	s_req_server_with_at,
	s_req_path,
	s_req_query_string_start,
	s_req_query_string,
	s_req_fragment_start,
	s_req_fragment,

	/* States named after the protocol version at the end of a request */
	s_req_http_start,
	s_req_http_H,
	s_req_http_HT,
	s_req_http_HTT,
	s_req_http_HTTP,
	s_req_http_I,
	s_req_http_IC,
	s_req_http_major,
	s_req_http_dot,
	s_req_http_minor,
	s_req_http_end,
	s_req_line_almost_done,

	/* States named after header fields and values */
	s_header_field_start,
	s_header_field,
	s_header_value_discard_ws,
	s_header_value_discard_ws_almost_done,
	s_header_value_discard_lws,
	s_header_value_start,
	s_header_value,
	s_header_value_lws,

	s_header_almost_done,

	/* States named after chunk size lines */
	s_chunk_size_start,
	s_chunk_size,
	s_chunk_parameters,
	s_chunk_size_almost_done,

	s_headers_almost_done,
	s_headers_done,

	/*
	 * Important: 's_headers_done' must be the last 'header' state. All
	 * states beyond this must be 'body' states. It is used for overflow
	 * checking. See the PARSING_HEADER() macro (not defined in this
	 * file).
	 */

	s_chunk_data,
	s_chunk_data_almost_done,
	s_chunk_data_done,

	s_body_identity,
	s_body_identity_eof,

	s_message_done
} state_t;
    177 
/*
 * States of the authority ("userinfo@host:port") sub-parser implemented by
 * http_parse_host_char() and driven by http_parse_host().  Values are
 * assigned implicitly starting from 1, so the order must be kept.
 */
typedef enum {
	/* Invalid character for the current state */
	s_http_host_dead = 1,

	/* Initial state when the authority contains an '@' */
	s_http_userinfo_start,
	/* At least one userinfo character has been consumed */
	s_http_userinfo,

	/* Initial state without userinfo, or right after the '@' */
	s_http_host_start,
	/* The opening '[' of an IPv6 literal has been consumed */
	s_http_host_v6_start,
	/* Inside a host made of IS_HOST_CHAR() characters */
	s_http_host,
	/* Inside the address part of an IPv6 literal */
	s_http_host_v6,
	/* The closing ']' of an IPv6 literal has been consumed */
	s_http_host_v6_end,
	/* The '%' introducing an IPv6 zone ID has been consumed */
	s_http_host_v6_zone_start,
	/* Inside an IPv6 zone ID */
	s_http_host_v6_zone,

	/* The ':' introducing the port has been consumed */
	s_http_host_port_start,
	/* At least one port digit has been consumed */
	s_http_host_port
} host_state_t;
    192 
/*
 * Character-class predicates.  IS_URL_CHAR() and IS_HOST_CHAR() depend on
 * whether HTTP_PARSER_STRICT is enabled.
 *
 * All of these evaluate their argument more than once, so the argument must
 * be free of side effects.  The argument is fully parenthesized, including
 * inside the casts to unsigned char that the <ctype.h> functions require, so
 * an expression may be passed safely.
 */

/* The characters - _ . ! ~ * ' ( ) */
#define IS_MARK(c)                                                             \
	((c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' || (c) == '~' || \
	 (c) == '*' || (c) == '\'' || (c) == '(' || (c) == ')')

/* Characters accepted in the userinfo part: alphanumerics, marks, '%' and
 * ; : & = + $ , */
#define IS_USERINFO_CHAR(c)                                                    \
	(isalnum((unsigned char)(c)) || IS_MARK(c) || (c) == '%' ||            \
	 (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
	 (c) == '$' || (c) == ',')

#if HTTP_PARSER_STRICT
/* Strict: only characters flagged in normal_url_char */
#define IS_URL_CHAR(c) (BIT_AT(normal_url_char, (unsigned char)(c)))
/* Strict: alphanumerics, '.' and '-' */
#define IS_HOST_CHAR(c) \
	(isalnum((unsigned char)(c)) || (c) == '.' || (c) == '-')
#else
/* Lenient: additionally accept any byte with the high bit set */
#define IS_URL_CHAR(c) \
	(BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
/* Lenient: additionally accept '_' */
#define IS_HOST_CHAR(c)                                              \
	(isalnum((unsigned char)(c)) || (c) == '.' || (c) == '-' || \
	 (c) == '_')
#endif
    211 
    212 /*
    213  * Our URL parser.
    214  *
    215  * This is designed to be shared by http_parser_execute() for URL validation,
    216  * hence it has a state transition + byte-for-byte interface. In addition, it
    217  * is meant to be embedded in http_parser_parse_url(), which does the dirty
    218  * work of turning state transitions URL components for its API.
    219  *
    220  * This function should only be invoked with non-space characters. It is
    221  * assumed that the caller cares about (and can detect) the transition between
    222  * URL and non-URL states by looking for these.
    223  */
    224 static state_t
    225 parse_url_char(state_t s, const char ch) {
    226 	if (ch == ' ' || ch == '\r' || ch == '\n') {
    227 		return s_dead;
    228 	}
    229 
    230 #if HTTP_PARSER_STRICT
    231 	if (ch == '\t' || ch == '\f') {
    232 		return s_dead;
    233 	}
    234 #endif
    235 
    236 	switch (s) {
    237 	case s_req_spaces_before_url:
    238 		/* Proxied requests are followed by scheme of an absolute URI
    239 		 * (alpha). All methods except CONNECT are followed by '/' or
    240 		 * '*'.
    241 		 */
    242 
    243 		if (ch == '/' || ch == '*') {
    244 			return s_req_path;
    245 		}
    246 
    247 		if (isalpha((unsigned char)ch)) {
    248 			return s_req_schema;
    249 		}
    250 
    251 		break;
    252 
    253 	case s_req_schema:
    254 		if (isalpha((unsigned char)ch)) {
    255 			return s;
    256 		}
    257 
    258 		if (ch == ':') {
    259 			return s_req_schema_slash;
    260 		}
    261 
    262 		break;
    263 
    264 	case s_req_schema_slash:
    265 		if (ch == '/') {
    266 			return s_req_schema_slash_slash;
    267 		}
    268 
    269 		break;
    270 
    271 	case s_req_schema_slash_slash:
    272 		if (ch == '/') {
    273 			return s_req_server_start;
    274 		}
    275 
    276 		break;
    277 
    278 	case s_req_server_with_at:
    279 		if (ch == '@') {
    280 			return s_dead;
    281 		}
    282 
    283 		FALLTHROUGH;
    284 	case s_req_server_start:
    285 	case s_req_server:
    286 		if (ch == '/') {
    287 			return s_req_path;
    288 		}
    289 
    290 		if (ch == '?') {
    291 			return s_req_query_string_start;
    292 		}
    293 
    294 		if (ch == '@') {
    295 			return s_req_server_with_at;
    296 		}
    297 
    298 		if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
    299 			return s_req_server;
    300 		}
    301 
    302 		break;
    303 
    304 	case s_req_path:
    305 		if (IS_URL_CHAR(ch)) {
    306 			return s;
    307 		}
    308 
    309 		switch (ch) {
    310 		case '?':
    311 			return s_req_query_string_start;
    312 
    313 		case '#':
    314 			return s_req_fragment_start;
    315 		}
    316 
    317 		break;
    318 
    319 	case s_req_query_string_start:
    320 	case s_req_query_string:
    321 		if (IS_URL_CHAR(ch)) {
    322 			return s_req_query_string;
    323 		}
    324 
    325 		switch (ch) {
    326 		case '?':
    327 			/* allow extra '?' in query string */
    328 			return s_req_query_string;
    329 
    330 		case '#':
    331 			return s_req_fragment_start;
    332 		}
    333 
    334 		break;
    335 
    336 	case s_req_fragment_start:
    337 		if (IS_URL_CHAR(ch)) {
    338 			return s_req_fragment;
    339 		}
    340 
    341 		switch (ch) {
    342 		case '?':
    343 			return s_req_fragment;
    344 
    345 		case '#':
    346 			return s;
    347 		}
    348 
    349 		break;
    350 
    351 	case s_req_fragment:
    352 		if (IS_URL_CHAR(ch)) {
    353 			return s;
    354 		}
    355 
    356 		switch (ch) {
    357 		case '?':
    358 		case '#':
    359 			return s;
    360 		}
    361 
    362 		break;
    363 
    364 	default:
    365 		break;
    366 	}
    367 
    368 	/*
    369 	 * We should never fall out of the switch above unless there's an
    370 	 * error.
    371 	 */
    372 	return s_dead;
    373 }
    374 
    375 static host_state_t
    376 http_parse_host_char(host_state_t s, const char ch) {
    377 	switch (s) {
    378 	case s_http_userinfo:
    379 	case s_http_userinfo_start:
    380 		if (ch == '@') {
    381 			return s_http_host_start;
    382 		}
    383 
    384 		if (IS_USERINFO_CHAR(ch)) {
    385 			return s_http_userinfo;
    386 		}
    387 		break;
    388 
    389 	case s_http_host_start:
    390 		if (ch == '[') {
    391 			return s_http_host_v6_start;
    392 		}
    393 
    394 		if (IS_HOST_CHAR(ch)) {
    395 			return s_http_host;
    396 		}
    397 
    398 		break;
    399 
    400 	case s_http_host:
    401 		if (IS_HOST_CHAR(ch)) {
    402 			return s_http_host;
    403 		}
    404 
    405 		FALLTHROUGH;
    406 	case s_http_host_v6_end:
    407 		if (ch == ':') {
    408 			return s_http_host_port_start;
    409 		}
    410 
    411 		break;
    412 
    413 	case s_http_host_v6:
    414 		if (ch == ']') {
    415 			return s_http_host_v6_end;
    416 		}
    417 
    418 		FALLTHROUGH;
    419 	case s_http_host_v6_start:
    420 		if (isxdigit((unsigned char)ch) || ch == ':' || ch == '.') {
    421 			return s_http_host_v6;
    422 		}
    423 
    424 		if (s == s_http_host_v6 && ch == '%') {
    425 			return s_http_host_v6_zone_start;
    426 		}
    427 		break;
    428 
    429 	case s_http_host_v6_zone:
    430 		if (ch == ']') {
    431 			return s_http_host_v6_end;
    432 		}
    433 
    434 		FALLTHROUGH;
    435 	case s_http_host_v6_zone_start:
    436 		/* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
    437 		if (isalnum((unsigned char)ch) || ch == '%' || ch == '.' ||
    438 		    ch == '-' || ch == '_' || ch == '~')
    439 		{
    440 			return s_http_host_v6_zone;
    441 		}
    442 		break;
    443 
    444 	case s_http_host_port:
    445 	case s_http_host_port_start:
    446 		if (isdigit((unsigned char)ch)) {
    447 			return s_http_host_port;
    448 		}
    449 
    450 		break;
    451 
    452 	default:
    453 		break;
    454 	}
    455 
    456 	return s_http_host_dead;
    457 }
    458 
    459 static isc_result_t
    460 http_parse_host(const char *buf, isc_url_parser_t *up, int found_at) {
    461 	host_state_t s;
    462 	const char *p = NULL;
    463 	size_t buflen = up->field_data[ISC_UF_HOST].off +
    464 			up->field_data[ISC_UF_HOST].len;
    465 
    466 	REQUIRE((up->field_set & (1 << ISC_UF_HOST)) != 0);
    467 
    468 	up->field_data[ISC_UF_HOST].len = 0;
    469 
    470 	s = found_at ? s_http_userinfo_start : s_http_host_start;
    471 
    472 	for (p = buf + up->field_data[ISC_UF_HOST].off; p < buf + buflen; p++) {
    473 		host_state_t new_s = http_parse_host_char(s, *p);
    474 
    475 		if (new_s == s_http_host_dead) {
    476 			return ISC_R_FAILURE;
    477 		}
    478 
    479 		switch (new_s) {
    480 		case s_http_host:
    481 			if (s != s_http_host) {
    482 				up->field_data[ISC_UF_HOST].off =
    483 					(uint16_t)(p - buf);
    484 			}
    485 			up->field_data[ISC_UF_HOST].len++;
    486 			break;
    487 
    488 		case s_http_host_v6:
    489 			if (s != s_http_host_v6) {
    490 				up->field_data[ISC_UF_HOST].off =
    491 					(uint16_t)(p - buf);
    492 			}
    493 			up->field_data[ISC_UF_HOST].len++;
    494 			break;
    495 
    496 		case s_http_host_v6_zone_start:
    497 		case s_http_host_v6_zone:
    498 			up->field_data[ISC_UF_HOST].len++;
    499 			break;
    500 
    501 		case s_http_host_port:
    502 			if (s != s_http_host_port) {
    503 				up->field_data[ISC_UF_PORT].off =
    504 					(uint16_t)(p - buf);
    505 				up->field_data[ISC_UF_PORT].len = 0;
    506 				up->field_set |= (1 << ISC_UF_PORT);
    507 			}
    508 			up->field_data[ISC_UF_PORT].len++;
    509 			break;
    510 
    511 		case s_http_userinfo:
    512 			if (s != s_http_userinfo) {
    513 				up->field_data[ISC_UF_USERINFO].off =
    514 					(uint16_t)(p - buf);
    515 				up->field_data[ISC_UF_USERINFO].len = 0;
    516 				up->field_set |= (1 << ISC_UF_USERINFO);
    517 			}
    518 			up->field_data[ISC_UF_USERINFO].len++;
    519 			break;
    520 
    521 		default:
    522 			break;
    523 		}
    524 
    525 		s = new_s;
    526 	}
    527 
    528 	/* Make sure we don't end somewhere unexpected */
    529 	switch (s) {
    530 	case s_http_host_start:
    531 	case s_http_host_v6_start:
    532 	case s_http_host_v6:
    533 	case s_http_host_v6_zone_start:
    534 	case s_http_host_v6_zone:
    535 	case s_http_host_port_start:
    536 	case s_http_userinfo:
    537 	case s_http_userinfo_start:
    538 		return ISC_R_FAILURE;
    539 	default:
    540 		break;
    541 	}
    542 
    543 	return ISC_R_SUCCESS;
    544 }
    545 
    546 isc_result_t
    547 isc_url_parse(const char *buf, size_t buflen, bool is_connect,
    548 	      isc_url_parser_t *up) {
    549 	state_t s;
    550 	isc_url_field_t uf, old_uf;
    551 	int found_at = 0;
    552 	const char *p = NULL;
    553 
    554 	if (buflen == 0) {
    555 		return ISC_R_FAILURE;
    556 	}
    557 
    558 	up->port = up->field_set = 0;
    559 	s = is_connect ? s_req_server_start : s_req_spaces_before_url;
    560 	old_uf = ISC_UF_MAX;
    561 
    562 	for (p = buf; p < buf + buflen; p++) {
    563 		s = parse_url_char(s, *p);
    564 
    565 		/* Figure out the next field that we're operating on */
    566 		switch (s) {
    567 		case s_dead:
    568 			return ISC_R_FAILURE;
    569 
    570 		/* Skip delimiters */
    571 		case s_req_schema_slash:
    572 		case s_req_schema_slash_slash:
    573 		case s_req_server_start:
    574 		case s_req_query_string_start:
    575 		case s_req_fragment_start:
    576 			continue;
    577 
    578 		case s_req_schema:
    579 			uf = ISC_UF_SCHEMA;
    580 			break;
    581 
    582 		case s_req_server_with_at:
    583 			found_at = 1;
    584 			FALLTHROUGH;
    585 		case s_req_server:
    586 			uf = ISC_UF_HOST;
    587 			break;
    588 
    589 		case s_req_path:
    590 			uf = ISC_UF_PATH;
    591 			break;
    592 
    593 		case s_req_query_string:
    594 			uf = ISC_UF_QUERY;
    595 			break;
    596 
    597 		case s_req_fragment:
    598 			uf = ISC_UF_FRAGMENT;
    599 			break;
    600 
    601 		default:
    602 			UNREACHABLE();
    603 		}
    604 
    605 		/* Nothing's changed; soldier on */
    606 		if (uf == old_uf) {
    607 			up->field_data[uf].len++;
    608 			continue;
    609 		}
    610 
    611 		up->field_data[uf].off = (uint16_t)(p - buf);
    612 		up->field_data[uf].len = 1;
    613 
    614 		up->field_set |= (1 << uf);
    615 		old_uf = uf;
    616 	}
    617 
    618 	/* host must be present if there is a schema */
    619 	/* parsing http:///toto will fail */
    620 	if ((up->field_set & (1 << ISC_UF_SCHEMA)) &&
    621 	    (up->field_set & (1 << ISC_UF_HOST)) == 0)
    622 	{
    623 		return ISC_R_FAILURE;
    624 	}
    625 
    626 	if (up->field_set & (1 << ISC_UF_HOST)) {
    627 		isc_result_t result;
    628 
    629 		result = http_parse_host(buf, up, found_at);
    630 		if (result != ISC_R_SUCCESS) {
    631 			return result;
    632 		}
    633 	}
    634 
    635 	/* CONNECT requests can only contain "hostname:port" */
    636 	if (is_connect &&
    637 	    up->field_set != ((1 << ISC_UF_HOST) | (1 << ISC_UF_PORT)))
    638 	{
    639 		return ISC_R_FAILURE;
    640 	}
    641 
    642 	if (up->field_set & (1 << ISC_UF_PORT)) {
    643 		uint16_t off;
    644 		uint16_t len;
    645 		const char *pp = NULL;
    646 		const char *end = NULL;
    647 		unsigned long v;
    648 
    649 		off = up->field_data[ISC_UF_PORT].off;
    650 		len = up->field_data[ISC_UF_PORT].len;
    651 		end = buf + off + len;
    652 
    653 		/*
    654 		 * NOTE: The characters are already validated and are in the
    655 		 * [0-9] range
    656 		 */
    657 		INSIST(off + len <= buflen);
    658 
    659 		v = 0;
    660 		for (pp = buf + off; pp < end; pp++) {
    661 			v *= 10;
    662 			v += *pp - '0';
    663 
    664 			/* Ports have a max value of 2^16 */
    665 			if (v > 0xffff) {
    666 				return ISC_R_RANGE;
    667 			}
    668 		}
    669 
    670 		up->port = (uint16_t)v;
    671 	}
    672 
    673 	return ISC_R_SUCCESS;
    674 }
    675