Home | History | Annotate | Line # | Download | only in dist
optimize.c revision 1.13
      1 /*	$NetBSD: optimize.c,v 1.13 2024/09/02 15:33:37 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that: (1) source code distributions
      9  * retain the above copyright notice and this paragraph in its entirety, (2)
     10  * distributions including binary code include the above copyright notice and
     11  * this paragraph in its entirety in the documentation or other materials
     12  * provided with the distribution, and (3) all advertising materials mentioning
     13  * features or use of this software display the following acknowledgement:
     14  * ``This product includes software developed by the University of California,
     15  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
     16  * the University nor the names of its contributors may be used to endorse
     17  * or promote products derived from this software without specific prior
     18  * written permission.
     19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
     20  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
     21  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
     22  *
     23  *  Optimization module for BPF code intermediate representation.
     24  */
     25 
     26 #include <sys/cdefs.h>
     27 __RCSID("$NetBSD: optimize.c,v 1.13 2024/09/02 15:33:37 christos Exp $");
     28 
     29 #include <config.h>
     30 
     31 #include <pcap-types.h>
     32 
     33 #include <stdio.h>
     34 #include <stdlib.h>
     35 #include <memory.h>
     36 #include <setjmp.h>
     37 #include <string.h>
     38 #include <limits.h> /* for SIZE_MAX */
     39 #include <errno.h>
     40 
     41 #include "pcap-int.h"
     42 
     43 #include "gencode.h"
     44 #include "optimize.h"
     45 #include "diag-control.h"
     46 
     47 #ifdef HAVE_OS_PROTO_H
     48 #include "os-proto.h"
     49 #endif
     50 
     51 #ifdef BDEBUG
     52 /*
     53  * The internal "debug printout" flag for the filter expression optimizer.
     54  * The code to print that stuff is present only if BDEBUG is defined, so
     55  * the flag, and the routine to set it, are defined only if BDEBUG is
     56  * defined.
     57  */
     58 static int pcap_optimizer_debug;
     59 
     60 /*
     61  * Routine to set that flag.
     62  *
     63  * This is intended for libpcap developers, not for general use.
     64  * If you want to set these in a program, you'll have to declare this
     65  * routine yourself, with the appropriate DLL import attribute on Windows;
     66  * it's not declared in any header file, and won't be declared in any
     67  * header file provided by libpcap.
     68  */
     69 PCAP_API void pcap_set_optimizer_debug(int value);
     70 
PCAP_API_DEF void
pcap_set_optimizer_debug(int value)
{
	/* Record the caller-supplied value of the internal debug flag. */
	pcap_optimizer_debug = value;
}
     76 
     77 /*
     78  * The internal "print dot graph" flag for the filter expression optimizer.
     79  * The code to print that stuff is present only if BDEBUG is defined, so
     80  * the flag, and the routine to set it, are defined only if BDEBUG is
     81  * defined.
     82  */
     83 static int pcap_print_dot_graph;
     84 
     85 /*
     86  * Routine to set that flag.
     87  *
     88  * This is intended for libpcap developers, not for general use.
     89  * If you want to set these in a program, you'll have to declare this
     90  * routine yourself, with the appropriate DLL import attribute on Windows;
     91  * it's not declared in any header file, and won't be declared in any
     92  * header file provided by libpcap.
     93  */
     94 PCAP_API void pcap_set_print_dot_graph(int value);
     95 
PCAP_API_DEF void
pcap_set_print_dot_graph(int value)
{
	/* Record the caller-supplied value of the "print dot graph" flag. */
	pcap_print_dot_graph = value;
}
    101 
    102 #endif
    103 
    104 /*
    105  * lowest_set_bit().
    106  *
    107  * Takes a 32-bit integer as an argument.
    108  *
    109  * If handed a non-zero value, returns the index of the lowest set bit,
    110  * counting upwards from zero.
    111  *
    112  * If handed zero, the results are platform- and compiler-dependent.
    113  * Keep it out of the light, don't give it any water, don't feed it
    114  * after midnight, and don't pass zero to it.
    115  *
    116  * This is the same as the count of trailing zeroes in the word.
    117  */
    118 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
    119   /*
    120    * GCC 3.4 and later; we have __builtin_ctz().
    121    */
    122   #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
    123 #elif defined(_MSC_VER)
    124   /*
    125    * Visual Studio; we support only 2005 and later, so use
    126    * _BitScanForward().
    127    */
    128 #include <intrin.h>
    129 
    130 #ifndef __clang__
    131 #pragma intrinsic(_BitScanForward)
    132 #endif
    133 
/*
 * Return the index of the lowest set bit in 'mask', i.e. the count of
 * trailing zeroes, using the _BitScanForward() intrinsic.
 * Aborts if 'mask' is zero, as _BitScanForward() then finds no bit.
 */
static __forceinline u_int
lowest_set_bit(int mask)
{
	unsigned long bit;

	/*
	 * Don't sign-extend mask if long is longer than int.
	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
	 */
	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
		abort();	/* mask is zero */
	return (u_int)bit;
}
    147 #elif defined(MSDOS) && defined(__DJGPP__)
    148   /*
    149    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
    150    * we've already included.
    151    */
    152   #define lowest_set_bit(mask)	((u_int)(ffs((mask)) - 1))
    153 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
    154   /*
    155    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
    156    * or some other platform (UN*X conforming to a sufficient recent version
    157    * of the Single UNIX Specification).
    158    */
    159   #include <strings.h>
    160   #define lowest_set_bit(mask)	(u_int)((ffs((mask)) - 1))
    161 #else
    162 /*
    163  * None of the above.
    164  * Use a perfect-hash-function-based function.
    165  */
static u_int
lowest_set_bit(int mask)
{
	/*
	 * Isolate the low-order set bit, then apply a minimal perfect
	 * hash based on a De Bruijn sequence; the table maps the hash
	 * to the number of trailing zeroes.  See:
	 *
	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
	 *
	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
	 */
	static const u_int debruijn_position[32] = {
		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
	};
	unsigned int word = (unsigned int)mask;
	unsigned int isolated = word & -word;	/* only the lowest set bit */

	return debruijn_position[(isolated * 0x077CB531U) >> 27];
}
    189 #endif
    190 
    191 /*
    192  * Represents a deleted instruction.
    193  */
    194 #define NOP -1
    195 
    196 /*
    197  * Register numbers for use-def values.
    198  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
    199  * location.  A_ATOM is the accumulator and X_ATOM is the index
    200  * register.
    201  */
    202 #define A_ATOM BPF_MEMWORDS
    203 #define X_ATOM (BPF_MEMWORDS+1)
    204 
    205 /*
    206  * This define is used to represent *both* the accumulator and
    207  * x register in use-def computations.
    208  * Currently, the use-def code assumes only one definition per instruction.
    209  */
    210 #define AX_ATOM N_ATOMS
    211 
    212 /*
    213  * These data structures are used in a Cocke and Schwartz style
    214  * value numbering scheme.  Since the flowgraph is acyclic,
    215  * exit values can be propagated from a node's predecessors
    216  * provided it is uniquely defined.
    217  */
struct valnode {
	int code;		/* opcode of the operation that produced this value */
	bpf_u_int32 v0, v1;	/* value numbers of the operands */
	int val;		/* the value number */
	struct valnode *next;	/* next entry on this hash chain (see F()) */
};
    224 
    225 /* Integer constants mapped with the load immediate opcode. */
    226 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
    227 
/* Per-value-number information: whether the value is a known constant. */
struct vmapinfo {
	int is_const;		/* nonzero if the value is a known constant */
	bpf_u_int32 const_val;	/* the constant; valid only if is_const is set */
};
    232 
/*
 * All of the state the optimizer threads through its passes.
 */
typedef struct {
	/*
	 * Place to longjmp to on an error.
	 */
	jmp_buf top_ctx;

	/*
	 * The buffer into which to put error message.
	 */
	char *errbuf;

	/*
	 * A flag to indicate that further optimization is needed.
	 * Iterative passes are continued until a given pass yields no
	 * code simplification or branch movement.
	 */
	int done;

	/*
	 * XXX - detect loops that do nothing but repeated AND/OR pullups
	 * and edge moves.
	 * If 100 passes in a row do nothing but that, treat that as a
	 * sign that we're in a loop that just shuffles in a cycle in
	 * which each pass just shuffles the code and we eventually
	 * get back to the original configuration.
	 *
	 * XXX - we need a non-heuristic way of detecting, or preventing,
	 * such a cycle.
	 */
	int non_branch_movement_performed;

	u_int n_blocks;		/* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
	struct block **blocks;	/* the blocks of the CFG */
	u_int n_edges;		/* twice n_blocks, so guaranteed to be > 0 */
	struct edge **edges;	/* the edges of the CFG */

	/*
	 * A bit vector set representation of the dominators.
	 * We round up the set size to the next power of two.
	 */
	u_int nodewords;	/* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
	u_int edgewords;	/* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
	struct block **levels;	/* heads of the per-level block lists built by find_levels() */
	bpf_u_int32 *space;	/* presumably the backing storage for the bit-vector sets — allocated elsewhere; confirm in opt_init() */

#define BITS_PER_WORD (8*sizeof(bpf_u_int32))
/*
 * True if a is in uset {p}
 */
#define SET_MEMBER(p, a) \
((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))

/*
 * Add 'a' to uset p.
 */
#define SET_INSERT(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * Delete 'a' from uset p.
 */
#define SET_DELETE(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * a := a intersect b
 * n must be guaranteed to be > 0
 */
#define SET_INTERSECT(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register u_int _n = n;\
	do *_x++ &= *_y++; while (--_n != 0);\
}

/*
 * a := a - b
 * n must be guaranteed to be > 0
 */
#define SET_SUBTRACT(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register u_int _n = n;\
	do *_x++ &=~ *_y++; while (--_n != 0);\
}

/*
 * a := a union b
 * n must be guaranteed to be > 0
 */
#define SET_UNION(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register u_int _n = n;\
	do *_x++ |= *_y++; while (--_n != 0);\
}

	uset all_dom_sets;	/* dominator bit vectors for all blocks (see find_dom()) */
	uset all_closure_sets;	/* backwards-closure bit vectors (see find_closure()) */
	uset all_edge_sets;	/* edge-dominator bit vectors (see find_edom()) */

#define MODULUS 213
	struct valnode *hashtbl[MODULUS];	/* hash chains for value numbering (see F()) */
	bpf_u_int32 curval;	/* last value number assigned; 0 is VAL_UNKNOWN */
	bpf_u_int32 maxval;	/* number of entries in vmap (see init_val()) */

	struct vmapinfo *vmap;	/* per-value-number constant information */
	struct valnode *vnode_base;	/* pool of valnodes */
	struct valnode *next_vnode;	/* next free valnode in the pool */
} opt_state_t;
    343 
/*
 * State used while converting the optimized basic-block form of the
 * code into the linear array form that BPF requires.
 */
typedef struct {
	/*
	 * Place to longjmp to on an error.
	 */
	jmp_buf top_ctx;

	/*
	 * The buffer into which to put error message.
	 */
	char *errbuf;

	/*
	 * Some pointers used to convert the basic block form of the code,
	 * into the array form that BPF requires.  'fstart' will point to
	 * the malloc'd array while 'ftail' is used during the recursive
	 * traversal.
	 */
	struct bpf_insn *fstart;
	struct bpf_insn *ftail;
} conv_state_t;
    364 
    365 static void opt_init(opt_state_t *, struct icode *);
    366 static void opt_cleanup(opt_state_t *);
    367 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
    368     PCAP_PRINTFLIKE(2, 3);
    369 
    370 static void intern_blocks(opt_state_t *, struct icode *);
    371 
    372 static void find_inedges(opt_state_t *, struct block *);
    373 #ifdef BDEBUG
    374 static void opt_dump(opt_state_t *, struct icode *);
    375 #endif
    376 
    377 #ifndef MAX
    378 #define MAX(a,b) ((a)>(b)?(a):(b))
    379 #endif
    380 
    381 static void
    382 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
    383 {
    384 	int level;
    385 
    386 	if (isMarked(ic, b))
    387 		return;
    388 
    389 	Mark(ic, b);
    390 	b->link = 0;
    391 
    392 	if (JT(b)) {
    393 		find_levels_r(opt_state, ic, JT(b));
    394 		find_levels_r(opt_state, ic, JF(b));
    395 		level = MAX(JT(b)->level, JF(b)->level) + 1;
    396 	} else
    397 		level = 0;
    398 	b->level = level;
    399 	b->link = opt_state->levels[level];
    400 	opt_state->levels[level] = b;
    401 }
    402 
    403 /*
    404  * Level graph.  The levels go from 0 at the leaves to
    405  * N_LEVELS at the root.  The opt_state->levels[] array points to the
    406  * first node of the level list, whose elements are linked
    407  * with the 'link' field of the struct block.
    408  */
    409 static void
    410 find_levels(opt_state_t *opt_state, struct icode *ic)
    411 {
    412 	memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
    413 	unMarkAll(ic);
    414 	find_levels_r(opt_state, ic, ic->root);
    415 }
    416 
    417 /*
    418  * Find dominator relationships.
    419  * Assumes graph has been leveled.
    420  */
    421 static void
    422 find_dom(opt_state_t *opt_state, struct block *root)
    423 {
    424 	u_int i;
    425 	int level;
    426 	struct block *b;
    427 	bpf_u_int32 *x;
    428 
    429 	/*
    430 	 * Initialize sets to contain all nodes.
    431 	 */
    432 	x = opt_state->all_dom_sets;
    433 	/*
    434 	 * In opt_init(), we've made sure the product doesn't overflow.
    435 	 */
    436 	i = opt_state->n_blocks * opt_state->nodewords;
    437 	while (i != 0) {
    438 		--i;
    439 		*x++ = 0xFFFFFFFFU;
    440 	}
    441 	/* Root starts off empty. */
    442 	for (i = opt_state->nodewords; i != 0;) {
    443 		--i;
    444 		root->dom[i] = 0;
    445 	}
    446 
    447 	/* root->level is the highest level no found. */
    448 	for (level = root->level; level >= 0; --level) {
    449 		for (b = opt_state->levels[level]; b; b = b->link) {
    450 			SET_INSERT(b->dom, b->id);
    451 			if (JT(b) == 0)
    452 				continue;
    453 			SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
    454 			SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
    455 		}
    456 	}
    457 }
    458 
    459 static void
    460 propedom(opt_state_t *opt_state, struct edge *ep)
    461 {
    462 	SET_INSERT(ep->edom, ep->id);
    463 	if (ep->succ) {
    464 		SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
    465 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
    466 	}
    467 }
    468 
    469 /*
    470  * Compute edge dominators.
    471  * Assumes graph has been leveled and predecessors established.
    472  */
    473 static void
    474 find_edom(opt_state_t *opt_state, struct block *root)
    475 {
    476 	u_int i;
    477 	uset x;
    478 	int level;
    479 	struct block *b;
    480 
    481 	x = opt_state->all_edge_sets;
    482 	/*
    483 	 * In opt_init(), we've made sure the product doesn't overflow.
    484 	 */
    485 	for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
    486 		--i;
    487 		x[i] = 0xFFFFFFFFU;
    488 	}
    489 
    490 	/* root->level is the highest level no found. */
    491 	memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
    492 	memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
    493 	for (level = root->level; level >= 0; --level) {
    494 		for (b = opt_state->levels[level]; b != 0; b = b->link) {
    495 			propedom(opt_state, &b->et);
    496 			propedom(opt_state, &b->ef);
    497 		}
    498 	}
    499 }
    500 
    501 /*
    502  * Find the backwards transitive closure of the flow graph.  These sets
    503  * are backwards in the sense that we find the set of nodes that reach
    504  * a given node, not the set of nodes that can be reached by a node.
    505  *
    506  * Assumes graph has been leveled.
    507  */
    508 static void
    509 find_closure(opt_state_t *opt_state, struct block *root)
    510 {
    511 	int level;
    512 	struct block *b;
    513 
    514 	/*
    515 	 * Initialize sets to contain no nodes.
    516 	 */
    517 	memset((char *)opt_state->all_closure_sets, 0,
    518 	      opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
    519 
    520 	/* root->level is the highest level no found. */
    521 	for (level = root->level; level >= 0; --level) {
    522 		for (b = opt_state->levels[level]; b; b = b->link) {
    523 			SET_INSERT(b->closure, b->id);
    524 			if (JT(b) == 0)
    525 				continue;
    526 			SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
    527 			SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
    528 		}
    529 	}
    530 }
    531 
    532 /*
    533  * Return the register number that is used by s.
    534  *
    535  * Returns ATOM_A if A is used, ATOM_X if X is used, AX_ATOM if both A and X
    536  * are used, the scratch memory location's number if a scratch memory
    537  * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
    538  *
    539  * The implementation should probably change to an array access.
    540  */
    541 static int
    542 atomuse(struct stmt *s)
    543 {
    544 	register int c = s->code;
    545 
    546 	if (c == NOP)
    547 		return -1;
    548 
    549 	switch (BPF_CLASS(c)) {
    550 
    551 	case BPF_RET:
    552 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
    553 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
    554 
    555 	case BPF_LD:
    556 	case BPF_LDX:
    557 		/*
    558 		 * As there are fewer than 2^31 memory locations,
    559 		 * s->k should be convertible to int without problems.
    560 		 */
    561 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
    562 			(BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
    563 
    564 	case BPF_ST:
    565 		return A_ATOM;
    566 
    567 	case BPF_STX:
    568 		return X_ATOM;
    569 
    570 	case BPF_JMP:
    571 	case BPF_ALU:
    572 		if (BPF_SRC(c) == BPF_X)
    573 			return AX_ATOM;
    574 		return A_ATOM;
    575 
    576 	case BPF_MISC:
    577 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
    578 	}
    579 	abort();
    580 	/* NOTREACHED */
    581 }
    582 
    583 /*
    584  * Return the register number that is defined by 's'.  We assume that
    585  * a single stmt cannot define more than one register.  If no register
    586  * is defined, return -1.
    587  *
    588  * The implementation should probably change to an array access.
    589  */
    590 static int
    591 atomdef(struct stmt *s)
    592 {
    593 	if (s->code == NOP)
    594 		return -1;
    595 
    596 	switch (BPF_CLASS(s->code)) {
    597 
    598 	case BPF_LD:
    599 	case BPF_ALU:
    600 		return A_ATOM;
    601 
    602 	case BPF_LDX:
    603 		return X_ATOM;
    604 
    605 	case BPF_ST:
    606 	case BPF_STX:
    607 		return s->k;
    608 
    609 	case BPF_MISC:
    610 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
    611 	}
    612 	return -1;
    613 }
    614 
    615 /*
    616  * Compute the sets of registers used, defined, and killed by 'b'.
    617  *
    618  * "Used" means that a statement in 'b' uses the register before any
    619  * statement in 'b' defines it, i.e. it uses the value left in
    620  * that register by a predecessor block of this block.
    621  * "Defined" means that a statement in 'b' defines it.
    622  * "Killed" means that a statement in 'b' defines it before any
    623  * statement in 'b' uses it, i.e. it kills the value left in that
    624  * register by a predecessor block of this block.
    625  */
    626 static void
    627 compute_local_ud(struct block *b)
    628 {
    629 	struct slist *s;
    630 	atomset def = 0, use = 0, killed = 0;
    631 	int atom;
    632 
    633 	for (s = b->stmts; s; s = s->next) {
    634 		if (s->s.code == NOP)
    635 			continue;
    636 		atom = atomuse(&s->s);
    637 		if (atom >= 0) {
    638 			if (atom == AX_ATOM) {
    639 				if (!ATOMELEM(def, X_ATOM))
    640 					use |= ATOMMASK(X_ATOM);
    641 				if (!ATOMELEM(def, A_ATOM))
    642 					use |= ATOMMASK(A_ATOM);
    643 			}
    644 			else if (atom < N_ATOMS) {
    645 				if (!ATOMELEM(def, atom))
    646 					use |= ATOMMASK(atom);
    647 			}
    648 			else
    649 				abort();
    650 		}
    651 		atom = atomdef(&s->s);
    652 		if (atom >= 0) {
    653 			if (!ATOMELEM(use, atom))
    654 				killed |= ATOMMASK(atom);
    655 			def |= ATOMMASK(atom);
    656 		}
    657 	}
    658 	if (BPF_CLASS(b->s.code) == BPF_JMP) {
    659 		/*
    660 		 * XXX - what about RET?
    661 		 */
    662 		atom = atomuse(&b->s);
    663 		if (atom >= 0) {
    664 			if (atom == AX_ATOM) {
    665 				if (!ATOMELEM(def, X_ATOM))
    666 					use |= ATOMMASK(X_ATOM);
    667 				if (!ATOMELEM(def, A_ATOM))
    668 					use |= ATOMMASK(A_ATOM);
    669 			}
    670 			else if (atom < N_ATOMS) {
    671 				if (!ATOMELEM(def, atom))
    672 					use |= ATOMMASK(atom);
    673 			}
    674 			else
    675 				abort();
    676 		}
    677 	}
    678 
    679 	b->def = def;
    680 	b->kill = killed;
    681 	b->in_use = use;
    682 }
    683 
    684 /*
    685  * Assume graph is already leveled.
    686  */
    687 static void
    688 find_ud(opt_state_t *opt_state, struct block *root)
    689 {
    690 	int i, maxlevel;
    691 	struct block *p;
    692 
    693 	/*
    694 	 * root->level is the highest level no found;
    695 	 * count down from there.
    696 	 */
    697 	maxlevel = root->level;
    698 	for (i = maxlevel; i >= 0; --i)
    699 		for (p = opt_state->levels[i]; p; p = p->link) {
    700 			compute_local_ud(p);
    701 			p->out_use = 0;
    702 		}
    703 
    704 	for (i = 1; i <= maxlevel; ++i) {
    705 		for (p = opt_state->levels[i]; p; p = p->link) {
    706 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
    707 			p->in_use |= p->out_use &~ p->kill;
    708 		}
    709 	}
    710 }
    711 static void
    712 init_val(opt_state_t *opt_state)
    713 {
    714 	opt_state->curval = 0;
    715 	opt_state->next_vnode = opt_state->vnode_base;
    716 	memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
    717 	memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
    718 }
    719 
    720 /*
    721  * Because we really don't have an IR, this stuff is a little messy.
    722  *
    723  * This routine looks in the table of existing value number for a value
    724  * with generated from an operation with the specified opcode and
    725  * the specified values.  If it finds it, it returns its value number,
    726  * otherwise it makes a new entry in the table and returns the
    727  * value number of that entry.
    728  */
    729 static bpf_u_int32
    730 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
    731 {
    732 	u_int hash;
    733 	bpf_u_int32 val;
    734 	struct valnode *p;
    735 
    736 	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
    737 	hash %= MODULUS;
    738 
    739 	for (p = opt_state->hashtbl[hash]; p; p = p->next)
    740 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
    741 			return p->val;
    742 
    743 	/*
    744 	 * Not found.  Allocate a new value, and assign it a new
    745 	 * value number.
    746 	 *
    747 	 * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
    748 	 * increment it before using it as the new value number, which
    749 	 * means we never assign VAL_UNKNOWN.
    750 	 *
    751 	 * XXX - unless we overflow, but we probably won't have 2^32-1
    752 	 * values; we treat 32 bits as effectively infinite.
    753 	 */
    754 	val = ++opt_state->curval;
    755 	if (BPF_MODE(code) == BPF_IMM &&
    756 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
    757 		opt_state->vmap[val].const_val = v0;
    758 		opt_state->vmap[val].is_const = 1;
    759 	}
    760 	p = opt_state->next_vnode++;
    761 	p->val = val;
    762 	p->code = code;
    763 	p->v0 = v0;
    764 	p->v1 = v1;
    765 	p->next = opt_state->hashtbl[hash];
    766 	opt_state->hashtbl[hash] = p;
    767 
    768 	return val;
    769 }
    770 
    771 static inline void
    772 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
    773 {
    774 	if (alter && newval != VAL_UNKNOWN && *valp == newval)
    775 		s->code = NOP;
    776 	else
    777 		*valp = newval;
    778 }
    779 
    780 /*
    781  * Do constant-folding on binary operators.
    782  * (Unary operators are handled elsewhere.)
    783  */
    784 static void
    785 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
    786 {
    787 	bpf_u_int32 a, b;
    788 
    789 	a = opt_state->vmap[v0].const_val;
    790 	b = opt_state->vmap[v1].const_val;
    791 
    792 	switch (BPF_OP(s->code)) {
    793 	case BPF_ADD:
    794 		a += b;
    795 		break;
    796 
    797 	case BPF_SUB:
    798 		a -= b;
    799 		break;
    800 
    801 	case BPF_MUL:
    802 		a *= b;
    803 		break;
    804 
    805 	case BPF_DIV:
    806 		if (b == 0)
    807 			opt_error(opt_state, "division by zero");
    808 		a /= b;
    809 		break;
    810 
    811 	case BPF_MOD:
    812 		if (b == 0)
    813 			opt_error(opt_state, "modulus by zero");
    814 		a %= b;
    815 		break;
    816 
    817 	case BPF_AND:
    818 		a &= b;
    819 		break;
    820 
    821 	case BPF_OR:
    822 		a |= b;
    823 		break;
    824 
    825 	case BPF_XOR:
    826 		a ^= b;
    827 		break;
    828 
    829 	case BPF_LSH:
    830 		/*
    831 		 * A left shift of more than the width of the type
    832 		 * is undefined in C; we'll just treat it as shifting
    833 		 * all the bits out.
    834 		 *
    835 		 * XXX - the BPF interpreter doesn't check for this,
    836 		 * so its behavior is dependent on the behavior of
    837 		 * the processor on which it's running.  There are
    838 		 * processors on which it shifts all the bits out
    839 		 * and processors on which it does no shift.
    840 		 */
    841 		if (b < 32)
    842 			a <<= b;
    843 		else
    844 			a = 0;
    845 		break;
    846 
    847 	case BPF_RSH:
    848 		/*
    849 		 * A right shift of more than the width of the type
    850 		 * is undefined in C; we'll just treat it as shifting
    851 		 * all the bits out.
    852 		 *
    853 		 * XXX - the BPF interpreter doesn't check for this,
    854 		 * so its behavior is dependent on the behavior of
    855 		 * the processor on which it's running.  There are
    856 		 * processors on which it shifts all the bits out
    857 		 * and processors on which it does no shift.
    858 		 */
    859 		if (b < 32)
    860 			a >>= b;
    861 		else
    862 			a = 0;
    863 		break;
    864 
    865 	default:
    866 		abort();
    867 	}
    868 	s->k = a;
    869 	s->code = BPF_LD|BPF_IMM;
    870 	/*
    871 	 * XXX - optimizer loop detection.
    872 	 */
    873 	opt_state->non_branch_movement_performed = 1;
    874 	opt_state->done = 0;
    875 }
    876 
    877 static inline struct slist *
    878 this_op(struct slist *s)
    879 {
    880 	while (s != 0 && s->s.code == NOP)
    881 		s = s->next;
    882 	return s;
    883 }
    884 
    885 static void
    886 opt_not(struct block *b)
    887 {
    888 	struct block *tmp = JT(b);
    889 
    890 	JT(b) = JF(b);
    891 	JF(b) = tmp;
    892 }
    893 
    894 static void
    895 opt_peep(opt_state_t *opt_state, struct block *b)
    896 {
    897 	struct slist *s;
    898 	struct slist *next, *last;
    899 	bpf_u_int32 val;
    900 
    901 	s = b->stmts;
    902 	if (s == 0)
    903 		return;
    904 
    905 	last = s;
    906 	for (/*empty*/; /*empty*/; s = next) {
    907 		/*
    908 		 * Skip over nops.
    909 		 */
    910 		s = this_op(s);
    911 		if (s == 0)
    912 			break;	/* nothing left in the block */
    913 
    914 		/*
    915 		 * Find the next real instruction after that one
    916 		 * (skipping nops).
    917 		 */
    918 		next = this_op(s->next);
    919 		if (next == 0)
    920 			break;	/* no next instruction */
    921 		last = next;
    922 
    923 		/*
    924 		 * st  M[k]	-->	st  M[k]
    925 		 * ldx M[k]		tax
    926 		 */
    927 		if (s->s.code == BPF_ST &&
    928 		    next->s.code == (BPF_LDX|BPF_MEM) &&
    929 		    s->s.k == next->s.k) {
    930 			/*
    931 			 * XXX - optimizer loop detection.
    932 			 */
    933 			opt_state->non_branch_movement_performed = 1;
    934 			opt_state->done = 0;
    935 			next->s.code = BPF_MISC|BPF_TAX;
    936 		}
    937 		/*
    938 		 * ld  #k	-->	ldx  #k
    939 		 * tax			txa
    940 		 */
    941 		if (s->s.code == (BPF_LD|BPF_IMM) &&
    942 		    next->s.code == (BPF_MISC|BPF_TAX)) {
    943 			s->s.code = BPF_LDX|BPF_IMM;
    944 			next->s.code = BPF_MISC|BPF_TXA;
    945 			/*
    946 			 * XXX - optimizer loop detection.
    947 			 */
    948 			opt_state->non_branch_movement_performed = 1;
    949 			opt_state->done = 0;
    950 		}
    951 		/*
    952 		 * This is an ugly special case, but it happens
    953 		 * when you say tcp[k] or udp[k] where k is a constant.
    954 		 */
    955 		if (s->s.code == (BPF_LD|BPF_IMM)) {
    956 			struct slist *add, *tax, *ild;
    957 
    958 			/*
    959 			 * Check that X isn't used on exit from this
    960 			 * block (which the optimizer might cause).
    961 			 * We know the code generator won't generate
    962 			 * any local dependencies.
    963 			 */
    964 			if (ATOMELEM(b->out_use, X_ATOM))
    965 				continue;
    966 
    967 			/*
    968 			 * Check that the instruction following the ldi
    969 			 * is an addx, or it's an ldxms with an addx
    970 			 * following it (with 0 or more nops between the
    971 			 * ldxms and addx).
    972 			 */
    973 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
    974 				add = next;
    975 			else
    976 				add = this_op(next->next);
    977 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
    978 				continue;
    979 
    980 			/*
    981 			 * Check that a tax follows that (with 0 or more
    982 			 * nops between them).
    983 			 */
    984 			tax = this_op(add->next);
    985 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
    986 				continue;
    987 
    988 			/*
    989 			 * Check that an ild follows that (with 0 or more
    990 			 * nops between them).
    991 			 */
    992 			ild = this_op(tax->next);
    993 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
    994 			    BPF_MODE(ild->s.code) != BPF_IND)
    995 				continue;
    996 			/*
    997 			 * We want to turn this sequence:
    998 			 *
    999 			 * (004) ldi     #0x2		{s}
   1000 			 * (005) ldxms   [14]		{next}  -- optional
   1001 			 * (006) addx			{add}
   1002 			 * (007) tax			{tax}
   1003 			 * (008) ild     [x+0]		{ild}
   1004 			 *
   1005 			 * into this sequence:
   1006 			 *
   1007 			 * (004) nop
   1008 			 * (005) ldxms   [14]
   1009 			 * (006) nop
   1010 			 * (007) nop
   1011 			 * (008) ild     [x+2]
   1012 			 *
   1013 			 * XXX We need to check that X is not
   1014 			 * subsequently used, because we want to change
   1015 			 * what'll be in it after this sequence.
   1016 			 *
   1017 			 * We know we can eliminate the accumulator
   1018 			 * modifications earlier in the sequence since
   1019 			 * it is defined by the last stmt of this sequence
   1020 			 * (i.e., the last statement of the sequence loads
   1021 			 * a value into the accumulator, so we can eliminate
   1022 			 * earlier operations on the accumulator).
   1023 			 */
   1024 			ild->s.k += s->s.k;
   1025 			s->s.code = NOP;
   1026 			add->s.code = NOP;
   1027 			tax->s.code = NOP;
   1028 			/*
   1029 			 * XXX - optimizer loop detection.
   1030 			 */
   1031 			opt_state->non_branch_movement_performed = 1;
   1032 			opt_state->done = 0;
   1033 		}
   1034 	}
   1035 	/*
   1036 	 * If the comparison at the end of a block is an equality
   1037 	 * comparison against a constant, and nobody uses the value
   1038 	 * we leave in the A register at the end of a block, and
   1039 	 * the operation preceding the comparison is an arithmetic
   1040 	 * operation, we can sometime optimize it away.
   1041 	 */
   1042 	if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
   1043 	    !ATOMELEM(b->out_use, A_ATOM)) {
   1044 		/*
   1045 		 * We can optimize away certain subtractions of the
   1046 		 * X register.
   1047 		 */
   1048 		if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
   1049 			val = b->val[X_ATOM];
   1050 			if (opt_state->vmap[val].is_const) {
   1051 				/*
   1052 				 * If we have a subtract to do a comparison,
   1053 				 * and the X register is a known constant,
   1054 				 * we can merge this value into the
   1055 				 * comparison:
   1056 				 *
   1057 				 * sub x  ->	nop
   1058 				 * jeq #y	jeq #(x+y)
   1059 				 */
   1060 				b->s.k += opt_state->vmap[val].const_val;
   1061 				last->s.code = NOP;
   1062 				/*
   1063 				 * XXX - optimizer loop detection.
   1064 				 */
   1065 				opt_state->non_branch_movement_performed = 1;
   1066 				opt_state->done = 0;
   1067 			} else if (b->s.k == 0) {
   1068 				/*
   1069 				 * If the X register isn't a constant,
   1070 				 * and the comparison in the test is
   1071 				 * against 0, we can compare with the
   1072 				 * X register, instead:
   1073 				 *
   1074 				 * sub x  ->	nop
   1075 				 * jeq #0	jeq x
   1076 				 */
   1077 				last->s.code = NOP;
   1078 				b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
   1079 				/*
   1080 				 * XXX - optimizer loop detection.
   1081 				 */
   1082 				opt_state->non_branch_movement_performed = 1;
   1083 				opt_state->done = 0;
   1084 			}
   1085 		}
   1086 		/*
   1087 		 * Likewise, a constant subtract can be simplified:
   1088 		 *
   1089 		 * sub #x ->	nop
   1090 		 * jeq #y ->	jeq #(x+y)
   1091 		 */
   1092 		else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
   1093 			last->s.code = NOP;
   1094 			b->s.k += last->s.k;
   1095 			/*
   1096 			 * XXX - optimizer loop detection.
   1097 			 */
   1098 			opt_state->non_branch_movement_performed = 1;
   1099 			opt_state->done = 0;
   1100 		}
   1101 		/*
   1102 		 * And, similarly, a constant AND can be simplified
   1103 		 * if we're testing against 0, i.e.:
   1104 		 *
   1105 		 * and #k	nop
   1106 		 * jeq #0  ->	jset #k
   1107 		 */
   1108 		else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
   1109 		    b->s.k == 0) {
   1110 			b->s.k = last->s.k;
   1111 			b->s.code = BPF_JMP|BPF_K|BPF_JSET;
   1112 			last->s.code = NOP;
   1113 			/*
   1114 			 * XXX - optimizer loop detection.
   1115 			 */
   1116 			opt_state->non_branch_movement_performed = 1;
   1117 			opt_state->done = 0;
   1118 			opt_not(b);
   1119 		}
   1120 	}
   1121 	/*
   1122 	 * jset #0        ->   never
   1123 	 * jset #ffffffff ->   always
   1124 	 */
   1125 	if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
   1126 		if (b->s.k == 0)
   1127 			JT(b) = JF(b);
   1128 		if (b->s.k == 0xffffffffU)
   1129 			JF(b) = JT(b);
   1130 	}
   1131 	/*
   1132 	 * If we're comparing against the index register, and the index
   1133 	 * register is a known constant, we can just compare against that
   1134 	 * constant.
   1135 	 */
   1136 	val = b->val[X_ATOM];
   1137 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
   1138 		bpf_u_int32 v = opt_state->vmap[val].const_val;
   1139 		b->s.code &= ~BPF_X;
   1140 		b->s.k = v;
   1141 	}
   1142 	/*
   1143 	 * If the accumulator is a known constant, we can compute the
   1144 	 * comparison result.
   1145 	 */
   1146 	val = b->val[A_ATOM];
   1147 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
   1148 		bpf_u_int32 v = opt_state->vmap[val].const_val;
   1149 		switch (BPF_OP(b->s.code)) {
   1150 
   1151 		case BPF_JEQ:
   1152 			v = v == b->s.k;
   1153 			break;
   1154 
   1155 		case BPF_JGT:
   1156 			v = v > b->s.k;
   1157 			break;
   1158 
   1159 		case BPF_JGE:
   1160 			v = v >= b->s.k;
   1161 			break;
   1162 
   1163 		case BPF_JSET:
   1164 			v &= b->s.k;
   1165 			break;
   1166 
   1167 		default:
   1168 			abort();
   1169 		}
   1170 		if (JF(b) != JT(b)) {
   1171 			/*
   1172 			 * XXX - optimizer loop detection.
   1173 			 */
   1174 			opt_state->non_branch_movement_performed = 1;
   1175 			opt_state->done = 0;
   1176 		}
   1177 		if (v)
   1178 			JF(b) = JT(b);
   1179 		else
   1180 			JT(b) = JF(b);
   1181 	}
   1182 }
   1183 
   1184 /*
   1185  * Compute the symbolic value of expression of 's', and update
   1186  * anything it defines in the value table 'val'.  If 'alter' is true,
   1187  * do various optimizations.  This code would be cleaner if symbolic
   1188  * evaluation and code transformations weren't folded together.
   1189  */
   1190 static void
   1191 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
   1192 {
   1193 	int op;
   1194 	bpf_u_int32 v;
   1195 
   1196 	switch (s->code) {
   1197 
   1198 	case BPF_LD|BPF_ABS|BPF_W:
   1199 	case BPF_LD|BPF_ABS|BPF_H:
   1200 	case BPF_LD|BPF_ABS|BPF_B:
   1201 		v = F(opt_state, s->code, s->k, 0L);
   1202 		vstore(s, &val[A_ATOM], v, alter);
   1203 		break;
   1204 
   1205 	case BPF_LD|BPF_IND|BPF_W:
   1206 	case BPF_LD|BPF_IND|BPF_H:
   1207 	case BPF_LD|BPF_IND|BPF_B:
   1208 		v = val[X_ATOM];
   1209 		if (alter && opt_state->vmap[v].is_const) {
   1210 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
   1211 			s->k += opt_state->vmap[v].const_val;
   1212 			v = F(opt_state, s->code, s->k, 0L);
   1213 			/*
   1214 			 * XXX - optimizer loop detection.
   1215 			 */
   1216 			opt_state->non_branch_movement_performed = 1;
   1217 			opt_state->done = 0;
   1218 		}
   1219 		else
   1220 			v = F(opt_state, s->code, s->k, v);
   1221 		vstore(s, &val[A_ATOM], v, alter);
   1222 		break;
   1223 
   1224 	case BPF_LD|BPF_LEN:
   1225 		v = F(opt_state, s->code, 0L, 0L);
   1226 		vstore(s, &val[A_ATOM], v, alter);
   1227 		break;
   1228 
   1229 	case BPF_LD|BPF_IMM:
   1230 		v = K(s->k);
   1231 		vstore(s, &val[A_ATOM], v, alter);
   1232 		break;
   1233 
   1234 	case BPF_LDX|BPF_IMM:
   1235 		v = K(s->k);
   1236 		vstore(s, &val[X_ATOM], v, alter);
   1237 		break;
   1238 
   1239 	case BPF_LDX|BPF_MSH|BPF_B:
   1240 		v = F(opt_state, s->code, s->k, 0L);
   1241 		vstore(s, &val[X_ATOM], v, alter);
   1242 		break;
   1243 
   1244 	case BPF_ALU|BPF_NEG:
   1245 		if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
   1246 			s->code = BPF_LD|BPF_IMM;
   1247 			/*
   1248 			 * Do this negation as unsigned arithmetic; that's
   1249 			 * what modern BPF engines do, and it guarantees
   1250 			 * that all possible values can be negated.  (Yeah,
   1251 			 * negating 0x80000000, the minimum signed 32-bit
   1252 			 * two's-complement value, results in 0x80000000,
   1253 			 * so it's still negative, but we *should* be doing
   1254 			 * all unsigned arithmetic here, to match what
   1255 			 * modern BPF engines do.)
   1256 			 *
   1257 			 * Express it as 0U - (unsigned value) so that we
   1258 			 * don't get compiler warnings about negating an
   1259 			 * unsigned value and don't get UBSan warnings
   1260 			 * about the result of negating 0x80000000 being
   1261 			 * undefined.
   1262 			 */
   1263 			s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
   1264 			val[A_ATOM] = K(s->k);
   1265 		}
   1266 		else
   1267 			val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
   1268 		break;
   1269 
   1270 	case BPF_ALU|BPF_ADD|BPF_K:
   1271 	case BPF_ALU|BPF_SUB|BPF_K:
   1272 	case BPF_ALU|BPF_MUL|BPF_K:
   1273 	case BPF_ALU|BPF_DIV|BPF_K:
   1274 	case BPF_ALU|BPF_MOD|BPF_K:
   1275 	case BPF_ALU|BPF_AND|BPF_K:
   1276 	case BPF_ALU|BPF_OR|BPF_K:
   1277 	case BPF_ALU|BPF_XOR|BPF_K:
   1278 	case BPF_ALU|BPF_LSH|BPF_K:
   1279 	case BPF_ALU|BPF_RSH|BPF_K:
   1280 		op = BPF_OP(s->code);
   1281 		if (alter) {
   1282 			if (s->k == 0) {
   1283 				/*
   1284 				 * Optimize operations where the constant
   1285 				 * is zero.
   1286 				 *
   1287 				 * Don't optimize away "sub #0"
   1288 				 * as it may be needed later to
   1289 				 * fixup the generated math code.
   1290 				 *
   1291 				 * Fail if we're dividing by zero or taking
   1292 				 * a modulus by zero.
   1293 				 */
   1294 				if (op == BPF_ADD ||
   1295 				    op == BPF_LSH || op == BPF_RSH ||
   1296 				    op == BPF_OR || op == BPF_XOR) {
   1297 					s->code = NOP;
   1298 					break;
   1299 				}
   1300 				if (op == BPF_MUL || op == BPF_AND) {
   1301 					s->code = BPF_LD|BPF_IMM;
   1302 					val[A_ATOM] = K(s->k);
   1303 					break;
   1304 				}
   1305 				if (op == BPF_DIV)
   1306 					opt_error(opt_state,
   1307 					    "division by zero");
   1308 				if (op == BPF_MOD)
   1309 					opt_error(opt_state,
   1310 					    "modulus by zero");
   1311 			}
   1312 			if (opt_state->vmap[val[A_ATOM]].is_const) {
   1313 				fold_op(opt_state, s, val[A_ATOM], K(s->k));
   1314 				val[A_ATOM] = K(s->k);
   1315 				break;
   1316 			}
   1317 		}
   1318 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
   1319 		break;
   1320 
   1321 	case BPF_ALU|BPF_ADD|BPF_X:
   1322 	case BPF_ALU|BPF_SUB|BPF_X:
   1323 	case BPF_ALU|BPF_MUL|BPF_X:
   1324 	case BPF_ALU|BPF_DIV|BPF_X:
   1325 	case BPF_ALU|BPF_MOD|BPF_X:
   1326 	case BPF_ALU|BPF_AND|BPF_X:
   1327 	case BPF_ALU|BPF_OR|BPF_X:
   1328 	case BPF_ALU|BPF_XOR|BPF_X:
   1329 	case BPF_ALU|BPF_LSH|BPF_X:
   1330 	case BPF_ALU|BPF_RSH|BPF_X:
   1331 		op = BPF_OP(s->code);
   1332 		if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
   1333 			if (opt_state->vmap[val[A_ATOM]].is_const) {
   1334 				fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
   1335 				val[A_ATOM] = K(s->k);
   1336 			}
   1337 			else {
   1338 				s->code = BPF_ALU|BPF_K|op;
   1339 				s->k = opt_state->vmap[val[X_ATOM]].const_val;
   1340 				if ((op == BPF_LSH || op == BPF_RSH) &&
   1341 				    s->k > 31)
   1342 					opt_error(opt_state,
   1343 					    "shift by more than 31 bits");
   1344 				/*
   1345 				 * XXX - optimizer loop detection.
   1346 				 */
   1347 				opt_state->non_branch_movement_performed = 1;
   1348 				opt_state->done = 0;
   1349 				val[A_ATOM] =
   1350 					F(opt_state, s->code, val[A_ATOM], K(s->k));
   1351 			}
   1352 			break;
   1353 		}
   1354 		/*
   1355 		 * Check if we're doing something to an accumulator
   1356 		 * that is 0, and simplify.  This may not seem like
   1357 		 * much of a simplification but it could open up further
   1358 		 * optimizations.
   1359 		 * XXX We could also check for mul by 1, etc.
   1360 		 */
   1361 		if (alter && opt_state->vmap[val[A_ATOM]].is_const
   1362 		    && opt_state->vmap[val[A_ATOM]].const_val == 0) {
   1363 			if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
   1364 				s->code = BPF_MISC|BPF_TXA;
   1365 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
   1366 				break;
   1367 			}
   1368 			else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
   1369 				 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
   1370 				s->code = BPF_LD|BPF_IMM;
   1371 				s->k = 0;
   1372 				vstore(s, &val[A_ATOM], K(s->k), alter);
   1373 				break;
   1374 			}
   1375 			else if (op == BPF_NEG) {
   1376 				s->code = NOP;
   1377 				break;
   1378 			}
   1379 		}
   1380 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
   1381 		break;
   1382 
   1383 	case BPF_MISC|BPF_TXA:
   1384 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
   1385 		break;
   1386 
   1387 	case BPF_LD|BPF_MEM:
   1388 		v = val[s->k];
   1389 		if (alter && opt_state->vmap[v].is_const) {
   1390 			s->code = BPF_LD|BPF_IMM;
   1391 			s->k = opt_state->vmap[v].const_val;
   1392 			/*
   1393 			 * XXX - optimizer loop detection.
   1394 			 */
   1395 			opt_state->non_branch_movement_performed = 1;
   1396 			opt_state->done = 0;
   1397 		}
   1398 		vstore(s, &val[A_ATOM], v, alter);
   1399 		break;
   1400 
   1401 	case BPF_MISC|BPF_TAX:
   1402 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
   1403 		break;
   1404 
   1405 	case BPF_LDX|BPF_MEM:
   1406 		v = val[s->k];
   1407 		if (alter && opt_state->vmap[v].is_const) {
   1408 			s->code = BPF_LDX|BPF_IMM;
   1409 			s->k = opt_state->vmap[v].const_val;
   1410 			/*
   1411 			 * XXX - optimizer loop detection.
   1412 			 */
   1413 			opt_state->non_branch_movement_performed = 1;
   1414 			opt_state->done = 0;
   1415 		}
   1416 		vstore(s, &val[X_ATOM], v, alter);
   1417 		break;
   1418 
   1419 	case BPF_ST:
   1420 		vstore(s, &val[s->k], val[A_ATOM], alter);
   1421 		break;
   1422 
   1423 	case BPF_STX:
   1424 		vstore(s, &val[s->k], val[X_ATOM], alter);
   1425 		break;
   1426 	}
   1427 }
   1428 
   1429 static void
   1430 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
   1431 {
   1432 	register int atom;
   1433 
   1434 	atom = atomuse(s);
   1435 	if (atom >= 0) {
   1436 		if (atom == AX_ATOM) {
   1437 			last[X_ATOM] = 0;
   1438 			last[A_ATOM] = 0;
   1439 		}
   1440 		else
   1441 			last[atom] = 0;
   1442 	}
   1443 	atom = atomdef(s);
   1444 	if (atom >= 0) {
   1445 		if (last[atom]) {
   1446 			/*
   1447 			 * XXX - optimizer loop detection.
   1448 			 */
   1449 			opt_state->non_branch_movement_performed = 1;
   1450 			opt_state->done = 0;
   1451 			last[atom]->code = NOP;
   1452 		}
   1453 		last[atom] = s;
   1454 	}
   1455 }
   1456 
   1457 static void
   1458 opt_deadstores(opt_state_t *opt_state, register struct block *b)
   1459 {
   1460 	register struct slist *s;
   1461 	register int atom;
   1462 	struct stmt *last[N_ATOMS];
   1463 
   1464 	memset((char *)last, 0, sizeof last);
   1465 
   1466 	for (s = b->stmts; s != 0; s = s->next)
   1467 		deadstmt(opt_state, &s->s, last);
   1468 	deadstmt(opt_state, &b->s, last);
   1469 
   1470 	for (atom = 0; atom < N_ATOMS; ++atom)
   1471 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
   1472 			last[atom]->code = NOP;
   1473 			/*
   1474 			 * The store was removed as it's dead,
   1475 			 * so the value stored into now has
   1476 			 * an unknown value.
   1477 			 */
   1478 			vstore(0, &b->val[atom], VAL_UNKNOWN, 0);
   1479 			/*
   1480 			 * XXX - optimizer loop detection.
   1481 			 */
   1482 			opt_state->non_branch_movement_performed = 1;
   1483 			opt_state->done = 0;
   1484 		}
   1485 }
   1486 
/*
 * Optimize the code in block 'b': initialize the block's value table
 * from its predecessors, symbolically evaluate each statement (letting
 * opt_stmt() rewrite statements when 'do_stmts' is set), delete all of
 * the block's statements when they provably have no visible effect,
 * and otherwise run the peephole and dead-store passes.  Finally,
 * record the branch-operand value ('oval') and edge codes used by the
 * branch optimizer.
 */
static void
opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
{
	struct slist *s;
	struct edge *p;
	int i;
	bpf_u_int32 aval, xval;

#if 0
	for (s = b->stmts; s && s->next; s = s->next)
		if (BPF_CLASS(s->s.code) == BPF_JMP) {
			do_stmts = 0;
			break;
		}
#endif

	/*
	 * Initialize the atom values.
	 */
	p = b->in_edges;
	if (p == 0) {
		/*
		 * We have no predecessors, so everything is undefined
		 * upon entry to this block.
		 */
		memset((char *)b->val, 0, sizeof(b->val));
	} else {
		/*
		 * Inherit values from our predecessors.
		 *
		 * First, get the values from the predecessor along the
		 * first edge leading to this node.
		 */
		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
		/*
		 * Now look at all the other nodes leading to this node.
		 * If, for the predecessor along that edge, a register
		 * has a different value from the one we have (i.e.,
		 * control paths are merging, and the merging paths
		 * assign different values to that register), give the
		 * register the undefined value of 0.
		 */
		while ((p = p->next) != NULL) {
			for (i = 0; i < N_ATOMS; ++i)
				if (b->val[i] != p->pred->val[i])
					b->val[i] = 0;
		}
	}
	/* Remember the A and X values on entry, to detect no-op blocks below. */
	aval = b->val[A_ATOM];
	xval = b->val[X_ATOM];
	for (s = b->stmts; s; s = s->next)
		opt_stmt(opt_state, &s->s, b->val, do_stmts);

	/*
	 * This is a special case: if we don't use anything from this
	 * block, and we load the accumulator or index register with a
	 * value that is already there, or if this block is a return,
	 * eliminate all the statements.
	 *
	 * XXX - what if it does a store?  Presumably that falls under
	 * the heading of "if we don't use anything from this block",
	 * i.e., if we use any memory location set to a different
	 * value by this block, then we use something from this block.
	 *
	 * XXX - why does it matter whether we use anything from this
	 * block?  If the accumulator or index register doesn't change
	 * its value, isn't that OK even if we use that value?
	 *
	 * XXX - if we load the accumulator with a different value,
	 * and the block ends with a conditional branch, we obviously
	 * can't eliminate it, as the branch depends on that value.
	 * For the index register, the conditional branch only depends
	 * on the index register value if the test is against the index
	 * register value rather than a constant; if nothing uses the
	 * value we put into the index register, and we're not testing
	 * against the index register's value, and there aren't any
	 * other problems that would keep us from eliminating this
	 * block, can we eliminate it?
	 */
	if (do_stmts &&
	    ((b->out_use == 0 &&
	      aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
	      xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
	     BPF_CLASS(b->s.code) == BPF_RET)) {
		if (b->stmts != 0) {
			b->stmts = 0;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
	} else {
		opt_peep(opt_state, b);
		opt_deadstores(opt_state, b);
	}
	/*
	 * Set up values for branch optimizer.
	 */
	if (BPF_SRC(b->s.code) == BPF_K)
		b->oval = K(b->s.k);
	else
		b->oval = b->val[X_ATOM];
	b->et.code = b->s.code;
	b->ef.code = -b->s.code;
}
   1593 
   1594 /*
   1595  * Return true if any register that is used on exit from 'succ', has
   1596  * an exit value that is different from the corresponding exit value
   1597  * from 'b'.
   1598  */
   1599 static int
   1600 use_conflict(struct block *b, struct block *succ)
   1601 {
   1602 	int atom;
   1603 	atomset use = succ->out_use;
   1604 
   1605 	if (use == 0)
   1606 		return 0;
   1607 
   1608 	for (atom = 0; atom < N_ATOMS; ++atom)
   1609 		if (ATOMELEM(use, atom))
   1610 			if (b->val[atom] != succ->val[atom])
   1611 				return 1;
   1612 	return 0;
   1613 }
   1614 
   1615 /*
   1616  * Given a block that is the successor of an edge, and an edge that
   1617  * dominates that edge, return either a pointer to a child of that
   1618  * block (a block to which that block jumps) if that block is a
   1619  * candidate to replace the successor of the latter edge or NULL
   1620  * if neither of the children of the first block are candidates.
   1621  */
   1622 static struct block *
   1623 fold_edge(struct block *child, struct edge *ep)
   1624 {
   1625 	int sense;
   1626 	bpf_u_int32 aval0, aval1, oval0, oval1;
   1627 	int code = ep->code;
   1628 
   1629 	if (code < 0) {
   1630 		/*
   1631 		 * This edge is a "branch if false" edge.
   1632 		 */
   1633 		code = -code;
   1634 		sense = 0;
   1635 	} else {
   1636 		/*
   1637 		 * This edge is a "branch if true" edge.
   1638 		 */
   1639 		sense = 1;
   1640 	}
   1641 
   1642 	/*
   1643 	 * If the opcode for the branch at the end of the block we
   1644 	 * were handed isn't the same as the opcode for the branch
   1645 	 * to which the edge we were handed corresponds, the tests
   1646 	 * for those branches aren't testing the same conditions,
   1647 	 * so the blocks to which the first block branches aren't
   1648 	 * candidates to replace the successor of the edge.
   1649 	 */
   1650 	if (child->s.code != code)
   1651 		return 0;
   1652 
   1653 	aval0 = child->val[A_ATOM];
   1654 	oval0 = child->oval;
   1655 	aval1 = ep->pred->val[A_ATOM];
   1656 	oval1 = ep->pred->oval;
   1657 
   1658 	/*
   1659 	 * If the A register value on exit from the successor block
   1660 	 * isn't the same as the A register value on exit from the
   1661 	 * predecessor of the edge, the blocks to which the first
   1662 	 * block branches aren't candidates to replace the successor
   1663 	 * of the edge.
   1664 	 */
   1665 	if (aval0 != aval1)
   1666 		return 0;
   1667 
   1668 	if (oval0 == oval1)
   1669 		/*
   1670 		 * The operands of the branch instructions are
   1671 		 * identical, so the branches are testing the
   1672 		 * same condition, and the result is true if a true
   1673 		 * branch was taken to get here, otherwise false.
   1674 		 */
   1675 		return sense ? JT(child) : JF(child);
   1676 
   1677 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
   1678 		/*
   1679 		 * At this point, we only know the comparison if we
   1680 		 * came down the true branch, and it was an equality
   1681 		 * comparison with a constant.
   1682 		 *
   1683 		 * I.e., if we came down the true branch, and the branch
   1684 		 * was an equality comparison with a constant, we know the
   1685 		 * accumulator contains that constant.  If we came down
   1686 		 * the false branch, or the comparison wasn't with a
   1687 		 * constant, we don't know what was in the accumulator.
   1688 		 *
   1689 		 * We rely on the fact that distinct constants have distinct
   1690 		 * value numbers.
   1691 		 */
   1692 		return JF(child);
   1693 
   1694 	return 0;
   1695 }
   1696 
   1697 /*
   1698  * If we can make this edge go directly to a child of the edge's current
   1699  * successor, do so.
   1700  */
   1701 static void
   1702 opt_j(opt_state_t *opt_state, struct edge *ep)
   1703 {
   1704 	register u_int i, k;
   1705 	register struct block *target;
   1706 
   1707 	/*
   1708 	 * Does this edge go to a block where, if the test
   1709 	 * at the end of it succeeds, it goes to a block
   1710 	 * that's a leaf node of the DAG, i.e. a return
   1711 	 * statement?
   1712 	 * If so, there's nothing to optimize.
   1713 	 */
   1714 	if (JT(ep->succ) == 0)
   1715 		return;
   1716 
   1717 	/*
   1718 	 * Does this edge go to a block that goes, in turn, to
   1719 	 * the same block regardless of whether the test at the
   1720 	 * end succeeds or fails?
   1721 	 */
   1722 	if (JT(ep->succ) == JF(ep->succ)) {
   1723 		/*
   1724 		 * Common branch targets can be eliminated, provided
   1725 		 * there is no data dependency.
   1726 		 *
   1727 		 * Check whether any register used on exit from the
   1728 		 * block to which the successor of this edge goes
   1729 		 * has a value at that point that's different from
   1730 		 * the value it has on exit from the predecessor of
   1731 		 * this edge.  If not, the predecessor of this edge
   1732 		 * can just go to the block to which the successor
   1733 		 * of this edge goes, bypassing the successor of this
   1734 		 * edge, as the successor of this edge isn't doing
   1735 		 * any calculations whose results are different
   1736 		 * from what the blocks before it did and isn't
   1737 		 * doing any tests the results of which matter.
   1738 		 */
   1739 		if (!use_conflict(ep->pred, JT(ep->succ))) {
   1740 			/*
   1741 			 * No, there isn't.
   1742 			 * Make this edge go to the block to
   1743 			 * which the successor of that edge
   1744 			 * goes.
   1745 			 *
   1746 			 * XXX - optimizer loop detection.
   1747 			 */
   1748 			opt_state->non_branch_movement_performed = 1;
   1749 			opt_state->done = 0;
   1750 			ep->succ = JT(ep->succ);
   1751 		}
   1752 	}
   1753 	/*
   1754 	 * For each edge dominator that matches the successor of this
   1755 	 * edge, promote the edge successor to the its grandchild.
   1756 	 *
   1757 	 * XXX We violate the set abstraction here in favor a reasonably
   1758 	 * efficient loop.
   1759 	 */
   1760  top:
   1761 	for (i = 0; i < opt_state->edgewords; ++i) {
   1762 		/* i'th word in the bitset of dominators */
   1763 		register bpf_u_int32 x = ep->edom[i];
   1764 
   1765 		while (x != 0) {
   1766 			/* Find the next dominator in that word and mark it as found */
   1767 			k = lowest_set_bit(x);
   1768 			x &=~ ((bpf_u_int32)1 << k);
   1769 			k += i * BITS_PER_WORD;
   1770 
   1771 			target = fold_edge(ep->succ, opt_state->edges[k]);
   1772 			/*
   1773 			 * We have a candidate to replace the successor
   1774 			 * of ep.
   1775 			 *
   1776 			 * Check that there is no data dependency between
   1777 			 * nodes that will be violated if we move the edge;
   1778 			 * i.e., if any register used on exit from the
   1779 			 * candidate has a value at that point different
   1780 			 * from the value it has when we exit the
   1781 			 * predecessor of that edge, there's a data
   1782 			 * dependency that will be violated.
   1783 			 */
   1784 			if (target != 0 && !use_conflict(ep->pred, target)) {
   1785 				/*
   1786 				 * It's safe to replace the successor of
   1787 				 * ep; do so, and note that we've made
   1788 				 * at least one change.
   1789 				 *
   1790 				 * XXX - this is one of the operations that
   1791 				 * happens when the optimizer gets into
   1792 				 * one of those infinite loops.
   1793 				 */
   1794 				opt_state->done = 0;
   1795 				ep->succ = target;
   1796 				if (JT(target) != 0)
   1797 					/*
   1798 					 * Start over unless we hit a leaf.
   1799 					 */
   1800 					goto top;
   1801 				return;
   1802 			}
   1803 		}
   1804 	}
   1805 }
   1806 
   1807 /*
   1808  * XXX - is this, and and_pullup(), what's described in section 6.1.2
   1809  * "Predicate Assertion Propagation" in the BPF+ paper?
   1810  *
   1811  * Note that this looks at block dominators, not edge dominators.
   1812  * Don't think so.
   1813  *
   1814  * "A or B" compiles into
   1815  *
   1816  *          A
   1817  *       t / \ f
   1818  *        /   B
   1819  *       / t / \ f
   1820  *      \   /
   1821  *       \ /
   1822  *        X
   1823  *
   1824  *
   1825  */
   1826 static void
   1827 or_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
   1828 {
   1829 	bpf_u_int32 val;
   1830 	int at_top;
   1831 	struct block *pull;
   1832 	struct block **diffp, **samep;
   1833 	struct edge *ep;
   1834 
   1835 	ep = b->in_edges;
   1836 	if (ep == 0)
   1837 		return;
   1838 
   1839 	/*
   1840 	 * Make sure each predecessor loads the same value.
   1841 	 * XXX why?
   1842 	 */
   1843 	val = ep->pred->val[A_ATOM];
   1844 	for (ep = ep->next; ep != 0; ep = ep->next)
   1845 		if (val != ep->pred->val[A_ATOM])
   1846 			return;
   1847 
   1848 	/*
   1849 	 * For the first edge in the list of edges coming into this block,
   1850 	 * see whether the predecessor of that edge comes here via a true
   1851 	 * branch or a false branch.
   1852 	 */
   1853 	if (JT(b->in_edges->pred) == b)
   1854 		diffp = &JT(b->in_edges->pred);	/* jt */
   1855 	else
   1856 		diffp = &JF(b->in_edges->pred);	/* jf */
   1857 
   1858 	/*
   1859 	 * diffp is a pointer to a pointer to the block.
   1860 	 *
   1861 	 * Go down the false chain looking as far as you can,
   1862 	 * making sure that each jump-compare is doing the
   1863 	 * same as the original block.
   1864 	 *
   1865 	 * If you reach the bottom before you reach a
   1866 	 * different jump-compare, just exit.  There's nothing
   1867 	 * to do here.  XXX - no, this version is checking for
   1868 	 * the value leaving the block; that's from the BPF+
   1869 	 * pullup routine.
   1870 	 */
   1871 	at_top = 1;
   1872 	for (;;) {
   1873 		/*
   1874 		 * Done if that's not going anywhere XXX
   1875 		 */
   1876 		if (*diffp == 0)
   1877 			return;
   1878 
   1879 		/*
   1880 		 * Done if that predecessor blah blah blah isn't
   1881 		 * going the same place we're going XXX
   1882 		 *
   1883 		 * Does the true edge of this block point to the same
   1884 		 * location as the true edge of b?
   1885 		 */
   1886 		if (JT(*diffp) != JT(b))
   1887 			return;
   1888 
   1889 		/*
   1890 		 * Done if this node isn't a dominator of that
   1891 		 * node blah blah blah XXX
   1892 		 *
   1893 		 * Does b dominate diffp?
   1894 		 */
   1895 		if (!SET_MEMBER((*diffp)->dom, b->id))
   1896 			return;
   1897 
   1898 		/*
   1899 		 * Break out of the loop if that node's value of A
   1900 		 * isn't the value of A above XXX
   1901 		 */
   1902 		if ((*diffp)->val[A_ATOM] != val)
   1903 			break;
   1904 
   1905 		/*
   1906 		 * Get the JF for that node XXX
   1907 		 * Go down the false path.
   1908 		 */
   1909 		diffp = &JF(*diffp);
   1910 		at_top = 0;
   1911 	}
   1912 
   1913 	/*
   1914 	 * Now that we've found a different jump-compare in a chain
   1915 	 * below b, search further down until we find another
   1916 	 * jump-compare that looks at the original value.  This
   1917 	 * jump-compare should get pulled up.  XXX again we're
   1918 	 * comparing values not jump-compares.
   1919 	 */
   1920 	samep = &JF(*diffp);
   1921 	for (;;) {
   1922 		/*
   1923 		 * Done if that's not going anywhere XXX
   1924 		 */
   1925 		if (*samep == 0)
   1926 			return;
   1927 
   1928 		/*
   1929 		 * Done if that predecessor blah blah blah isn't
   1930 		 * going the same place we're going XXX
   1931 		 */
   1932 		if (JT(*samep) != JT(b))
   1933 			return;
   1934 
   1935 		/*
   1936 		 * Done if this node isn't a dominator of that
   1937 		 * node blah blah blah XXX
   1938 		 *
   1939 		 * Does b dominate samep?
   1940 		 */
   1941 		if (!SET_MEMBER((*samep)->dom, b->id))
   1942 			return;
   1943 
   1944 		/*
   1945 		 * Break out of the loop if that node's value of A
   1946 		 * is the value of A above XXX
   1947 		 */
   1948 		if ((*samep)->val[A_ATOM] == val)
   1949 			break;
   1950 
   1951 		/* XXX Need to check that there are no data dependencies
   1952 		   between dp0 and dp1.  Currently, the code generator
   1953 		   will not produce such dependencies. */
   1954 		samep = &JF(*samep);
   1955 	}
   1956 #ifdef notdef
   1957 	/* XXX This doesn't cover everything. */
   1958 	for (i = 0; i < N_ATOMS; ++i)
   1959 		if ((*samep)->val[i] != pred->val[i])
   1960 			return;
   1961 #endif
   1962 	/* Pull up the node. */
   1963 	pull = *samep;
   1964 	*samep = JF(pull);
   1965 	JF(pull) = *diffp;
   1966 
   1967 	/*
   1968 	 * At the top of the chain, each predecessor needs to point at the
   1969 	 * pulled up node.  Inside the chain, there is only one predecessor
   1970 	 * to worry about.
   1971 	 */
   1972 	if (at_top) {
   1973 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
   1974 			if (JT(ep->pred) == b)
   1975 				JT(ep->pred) = pull;
   1976 			else
   1977 				JF(ep->pred) = pull;
   1978 		}
   1979 	}
   1980 	else
   1981 		*diffp = pull;
   1982 
   1983 	/*
   1984 	 * XXX - this is one of the operations that happens when the
   1985 	 * optimizer gets into one of those infinite loops.
   1986 	 */
   1987 	opt_state->done = 0;
   1988 
   1989 	/*
   1990 	 * Recompute dominator sets as control flow graph has changed.
   1991 	 */
   1992 	find_dom(opt_state, root);
   1993 }
   1994 
static void
and_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
{
	bpf_u_int32 val;
	int at_top;
	struct block *pull;
	struct block **diffp, **samep;
	struct edge *ep;

	/*
	 * Mirror image of or_pullup(): walks the chain along the
	 * *true* edges and compares the *false* targets, pulling up a
	 * node whose test duplicates one already made above it.
	 */
	ep = b->in_edges;
	if (ep == 0)
		return;

	/*
	 * Make sure each predecessor loads the same value.
	 */
	val = ep->pred->val[A_ATOM];
	for (ep = ep->next; ep != 0; ep = ep->next)
		if (val != ep->pred->val[A_ATOM])
			return;

	/*
	 * Get the pointer (within the first predecessor) to whichever
	 * of its edges leads to b; that's where the chain starts.
	 */
	if (JT(b->in_edges->pred) == b)
		diffp = &JT(b->in_edges->pred);
	else
		diffp = &JF(b->in_edges->pred);

	/*
	 * Walk down the true-edge chain until we find a block whose
	 * value of A differs from the one loaded above.
	 */
	at_top = 1;
	for (;;) {
		/* Bottom of the chain; nothing to do. */
		if (*diffp == 0)
			return;

		/* Must branch to the same false target as b. */
		if (JF(*diffp) != JF(b))
			return;

		/* b must dominate this block. */
		if (!SET_MEMBER((*diffp)->dom, b->id))
			return;

		/* Found a block computing a different value of A. */
		if ((*diffp)->val[A_ATOM] != val)
			break;

		/* Keep going down the true path. */
		diffp = &JT(*diffp);
		at_top = 0;
	}

	/*
	 * Continue below the differing block, looking for another
	 * block that tests the original value of A; that one can be
	 * pulled up above the differing block.
	 */
	samep = &JT(*diffp);
	for (;;) {
		/* Bottom of the chain; nothing to pull up. */
		if (*samep == 0)
			return;

		/* Must branch to the same false target as b. */
		if (JF(*samep) != JF(b))
			return;

		/* b must dominate this block too. */
		if (!SET_MEMBER((*samep)->dom, b->id))
			return;

		/* Found a block testing the same value of A. */
		if ((*samep)->val[A_ATOM] == val)
			break;

		/* XXX Need to check that there are no data dependencies
		   between diffp and samep.  Currently, the code generator
		   will not produce such dependencies. */
		samep = &JT(*samep);
	}
#ifdef notdef
	/* XXX This doesn't cover everything. */
	for (i = 0; i < N_ATOMS; ++i)
		if ((*samep)->val[i] != pred->val[i])
			return;
#endif
	/* Pull up the node: unlink it and splice it in above *diffp. */
	pull = *samep;
	*samep = JT(pull);
	JT(pull) = *diffp;

	/*
	 * At the top of the chain, each predecessor needs to point at the
	 * pulled up node.  Inside the chain, there is only one predecessor
	 * to worry about.
	 */
	if (at_top) {
		for (ep = b->in_edges; ep != 0; ep = ep->next) {
			if (JT(ep->pred) == b)
				JT(ep->pred) = pull;
			else
				JF(ep->pred) = pull;
		}
	}
	else
		*diffp = pull;

	/*
	 * XXX - this is one of the operations that happens when the
	 * optimizer gets into one of those infinite loops.
	 */
	opt_state->done = 0;

	/*
	 * Recompute dominator sets as control flow graph has changed.
	 */
	find_dom(opt_state, root);
}
   2095 
   2096 static void
   2097 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
   2098 {
   2099 	int i, maxlevel;
   2100 	struct block *p;
   2101 
   2102 	init_val(opt_state);
   2103 	maxlevel = ic->root->level;
   2104 
   2105 	find_inedges(opt_state, ic->root);
   2106 	for (i = maxlevel; i >= 0; --i)
   2107 		for (p = opt_state->levels[i]; p; p = p->link)
   2108 			opt_blk(opt_state, p, do_stmts);
   2109 
   2110 	if (do_stmts)
   2111 		/*
   2112 		 * No point trying to move branches; it can't possibly
   2113 		 * make a difference at this point.
   2114 		 *
   2115 		 * XXX - this might be after we detect a loop where
   2116 		 * we were just looping infinitely moving branches
   2117 		 * in such a fashion that we went through two or more
   2118 		 * versions of the machine code, eventually returning
   2119 		 * to the first version.  (We're really not doing a
   2120 		 * full loop detection, we're just testing for two
   2121 		 * passes in a row where we do nothing but
   2122 		 * move branches.)
   2123 		 */
   2124 		return;
   2125 
   2126 	/*
   2127 	 * Is this what the BPF+ paper describes in sections 6.1.1,
   2128 	 * 6.1.2, and 6.1.3?
   2129 	 */
   2130 	for (i = 1; i <= maxlevel; ++i) {
   2131 		for (p = opt_state->levels[i]; p; p = p->link) {
   2132 			opt_j(opt_state, &p->et);
   2133 			opt_j(opt_state, &p->ef);
   2134 		}
   2135 	}
   2136 
   2137 	find_inedges(opt_state, ic->root);
   2138 	for (i = 1; i <= maxlevel; ++i) {
   2139 		for (p = opt_state->levels[i]; p; p = p->link) {
   2140 			or_pullup(opt_state, p, ic->root);
   2141 			and_pullup(opt_state, p, ic->root);
   2142 		}
   2143 	}
   2144 }
   2145 
   2146 static inline void
   2147 link_inedge(struct edge *parent, struct block *child)
   2148 {
   2149 	parent->next = child->in_edges;
   2150 	child->in_edges = parent;
   2151 }
   2152 
   2153 static void
   2154 find_inedges(opt_state_t *opt_state, struct block *root)
   2155 {
   2156 	u_int i;
   2157 	int level;
   2158 	struct block *b;
   2159 
   2160 	for (i = 0; i < opt_state->n_blocks; ++i)
   2161 		opt_state->blocks[i]->in_edges = 0;
   2162 
   2163 	/*
   2164 	 * Traverse the graph, adding each edge to the predecessor
   2165 	 * list of its successors.  Skip the leaves (i.e. level 0).
   2166 	 */
   2167 	for (level = root->level; level > 0; --level) {
   2168 		for (b = opt_state->levels[level]; b != 0; b = b->link) {
   2169 			link_inedge(&b->et, JT(b));
   2170 			link_inedge(&b->ef, JF(b));
   2171 		}
   2172 	}
   2173 }
   2174 
   2175 static void
   2176 opt_root(struct block **b)
   2177 {
   2178 	struct slist *tmp, *s;
   2179 
   2180 	s = (*b)->stmts;
   2181 	(*b)->stmts = 0;
   2182 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
   2183 		*b = JT(*b);
   2184 
   2185 	tmp = (*b)->stmts;
   2186 	if (tmp != 0)
   2187 		sappend(s, tmp);
   2188 	(*b)->stmts = s;
   2189 
   2190 	/*
   2191 	 * If the root node is a return, then there is no
   2192 	 * point executing any statements (since the bpf machine
   2193 	 * has no side effects).
   2194 	 */
   2195 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
   2196 		(*b)->stmts = 0;
   2197 }
   2198 
/*
 * Repeatedly run optimization passes over the flowgraph until a pass
 * changes nothing (a fixed point) or the branch-movement loop-detection
 * heuristic fires.  "do_stmts" is forwarded to opt_blks().
 */
static void
opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
{

#ifdef BDEBUG
	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
		printf("opt_loop(root, %d) begin\n", do_stmts);
		opt_dump(opt_state, ic);
	}
#endif

	/*
	 * XXX - optimizer loop detection.
	 */
	int loop_count = 0;
	for (;;) {
		/* Assume a fixed point; any change in a pass clears this. */
		opt_state->done = 1;
		/*
		 * XXX - optimizer loop detection.
		 */
		opt_state->non_branch_movement_performed = 0;
		/* Recompute all the analyses the pass depends on. */
		find_levels(opt_state, ic);
		find_dom(opt_state, ic->root);
		find_closure(opt_state, ic->root);
		find_ud(opt_state, ic->root);
		find_edom(opt_state, ic->root);
		opt_blks(opt_state, ic, do_stmts);
#ifdef BDEBUG
		if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
			printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
			opt_dump(opt_state, ic);
		}
#endif

		/*
		 * Was anything done in this optimizer pass?
		 */
		if (opt_state->done) {
			/*
			 * No, so we've reached a fixed point.
			 * We're done.
			 */
			break;
		}

		/*
		 * XXX - was anything done other than branch movement
		 * in this pass?
		 */
		if (opt_state->non_branch_movement_performed) {
			/*
			 * Yes.  Clear any loop-detection counter;
			 * we're making some form of progress (assuming
			 * we can't get into a cycle doing *other*
			 * optimizations...).
			 */
			loop_count = 0;
		} else {
			/*
			 * No - increment the counter, and quit if
			 * it's up to 100.
			 */
			loop_count++;
			if (loop_count >= 100) {
				/*
				 * We've done nothing but branch movement
				 * for 100 passes; we're probably
				 * in a cycle and will never reach a
				 * fixed point.
				 *
				 * XXX - yes, we really need a non-
				 * heuristic way of detecting a cycle.
				 */
				opt_state->done = 1;
				break;
			}
		}
	}
}
   2278 
   2279 /*
   2280  * Optimize the filter code in its dag representation.
   2281  * Return 0 on success, -1 on error.
   2282  */
int
bpf_optimize(struct icode *ic, char *errbuf)
{
	opt_state_t opt_state;

	memset(&opt_state, 0, sizeof(opt_state));
	opt_state.errbuf = errbuf;
	opt_state.non_branch_movement_performed = 0;
	/*
	 * opt_error() longjmp()s back here on any failure; by then it
	 * has already formatted a message into errbuf, so just release
	 * whatever opt_init() managed to allocate and report failure.
	 */
	if (setjmp(opt_state.top_ctx)) {
		opt_cleanup(&opt_state);
		return -1;
	}
	opt_init(&opt_state, ic);
	/* First pass without statement optimization, then with it. */
	opt_loop(&opt_state, ic, 0);
	opt_loop(&opt_state, ic, 1);
	/* Merge blocks that compute identical results. */
	intern_blocks(&opt_state, ic);
#ifdef BDEBUG
	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
		printf("after intern_blocks()\n");
		opt_dump(&opt_state, ic);
	}
#endif
	/* Clean up vacuous jumps and dead statements at the root. */
	opt_root(&ic->root);
#ifdef BDEBUG
	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
		printf("after opt_root()\n");
		opt_dump(&opt_state, ic);
	}
#endif
	opt_cleanup(&opt_state);
	return 0;
}
   2315 
   2316 static void
   2317 make_marks(struct icode *ic, struct block *p)
   2318 {
   2319 	if (!isMarked(ic, p)) {
   2320 		Mark(ic, p);
   2321 		if (BPF_CLASS(p->s.code) != BPF_RET) {
   2322 			make_marks(ic, JT(p));
   2323 			make_marks(ic, JF(p));
   2324 		}
   2325 	}
   2326 }
   2327 
   2328 /*
   2329  * Mark code array such that isMarked(ic->cur_mark, i) is true
   2330  * only for nodes that are alive.
   2331  */
   2332 static void
   2333 mark_code(struct icode *ic)
   2334 {
   2335 	ic->cur_mark += 1;
   2336 	make_marks(ic, ic->root);
   2337 }
   2338 
   2339 /*
   2340  * True iff the two stmt lists load the same value from the packet into
   2341  * the accumulator.
   2342  */
   2343 static int
   2344 eq_slist(struct slist *x, struct slist *y)
   2345 {
   2346 	for (;;) {
   2347 		while (x && x->s.code == NOP)
   2348 			x = x->next;
   2349 		while (y && y->s.code == NOP)
   2350 			y = y->next;
   2351 		if (x == 0)
   2352 			return y == 0;
   2353 		if (y == 0)
   2354 			return x == 0;
   2355 		if (x->s.code != y->s.code || x->s.k != y->s.k)
   2356 			return 0;
   2357 		x = x->next;
   2358 		y = y->next;
   2359 	}
   2360 }
   2361 
   2362 static inline int
   2363 eq_blk(struct block *b0, struct block *b1)
   2364 {
   2365 	if (b0->s.code == b1->s.code &&
   2366 	    b0->s.k == b1->s.k &&
   2367 	    b0->et.succ == b1->et.succ &&
   2368 	    b0->ef.succ == b1->ef.succ)
   2369 		return eq_slist(b0->stmts, b1->stmts);
   2370 	return 0;
   2371 }
   2372 
/*
 * Merge equivalent blocks: for each live block, find an equivalent
 * block later in the array and record it in "link" as a forwarding
 * pointer, then redirect all jumps through those forwarding pointers.
 * Repeat until no jump was redirected (redirection can make more
 * blocks equivalent).
 */
static void
intern_blocks(opt_state_t *opt_state, struct icode *ic)
{
	struct block *p;
	u_int i, j;
	int done1; /* don't shadow global */
 top:
	done1 = 1;
	/* Clear all forwarding pointers. */
	for (i = 0; i < opt_state->n_blocks; ++i)
		opt_state->blocks[i]->link = 0;

	/* Mark the blocks still reachable from the root. */
	mark_code(ic);

	/*
	 * Scan backward so that each block's forwarding pointer ends
	 * up aimed at the final representative of its equivalence
	 * class (the candidate's own link is preferred if set).
	 */
	for (i = opt_state->n_blocks - 1; i != 0; ) {
		--i;
		if (!isMarked(ic, opt_state->blocks[i]))
			continue;
		for (j = i + 1; j < opt_state->n_blocks; ++j) {
			if (!isMarked(ic, opt_state->blocks[j]))
				continue;
			if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
				opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
					opt_state->blocks[j]->link : opt_state->blocks[j];
				break;
			}
		}
	}
	/* Redirect every jump to its target's representative. */
	for (i = 0; i < opt_state->n_blocks; ++i) {
		p = opt_state->blocks[i];
		if (JT(p) == 0)
			continue;
		if (JT(p)->link) {
			done1 = 0;
			JT(p) = JT(p)->link;
		}
		if (JF(p)->link) {
			done1 = 0;
			JF(p) = JF(p)->link;
		}
	}
	/* Something changed; new equivalences may now exist. */
	if (!done1)
		goto top;
}
   2416 
   2417 static void
   2418 opt_cleanup(opt_state_t *opt_state)
   2419 {
   2420 	free((void *)opt_state->vnode_base);
   2421 	free((void *)opt_state->vmap);
   2422 	free((void *)opt_state->edges);
   2423 	free((void *)opt_state->space);
   2424 	free((void *)opt_state->levels);
   2425 	free((void *)opt_state->blocks);
   2426 }
   2427 
   2428 /*
   2429  * For optimizer errors.
   2430  */
static void PCAP_NORETURN
opt_error(opt_state_t *opt_state, const char *fmt, ...)
{
	va_list ap;

	/* Format the message into the caller-supplied buffer, if any. */
	if (opt_state->errbuf != NULL) {
		va_start(ap, fmt);
		(void)vsnprintf(opt_state->errbuf,
		    PCAP_ERRBUF_SIZE, fmt, ap);
		va_end(ap);
	}
	/* Unwind to the setjmp() in bpf_optimize(); never returns. */
	longjmp(opt_state->top_ctx, 1);
	/* NOTREACHED */
#ifdef _AIX
	PCAP_UNREACHABLE
#endif /* _AIX */
}
   2448 
   2449 /*
   2450  * Return the number of stmts in 's'.
   2451  */
   2452 static u_int
   2453 slength(struct slist *s)
   2454 {
   2455 	u_int n = 0;
   2456 
   2457 	for (; s; s = s->next)
   2458 		if (s->s.code != NOP)
   2459 			++n;
   2460 	return n;
   2461 }
   2462 
   2463 /*
   2464  * Return the number of nodes reachable by 'p'.
   2465  * All nodes should be initially unmarked.
   2466  */
static int
count_blocks(struct icode *ic, struct block *p)
{
	int n;

	/* Null branch or a block already counted contributes nothing. */
	if (p == 0 || isMarked(ic, p))
		return 0;
	Mark(ic, p);
	n = count_blocks(ic, JT(p));
	n += count_blocks(ic, JF(p));
	return n + 1;
}
   2475 
   2476 /*
   2477  * Do a depth first search on the flow graph, numbering the
   2478  * the basic blocks, and entering them into the 'blocks' array.`
   2479  */
   2480 static void
   2481 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
   2482 {
   2483 	u_int n;
   2484 
   2485 	if (p == 0 || isMarked(ic, p))
   2486 		return;
   2487 
   2488 	Mark(ic, p);
   2489 	n = opt_state->n_blocks++;
   2490 	if (opt_state->n_blocks == 0) {
   2491 		/*
   2492 		 * Overflow.
   2493 		 */
   2494 		opt_error(opt_state, "filter is too complex to optimize");
   2495 	}
   2496 	p->id = n;
   2497 	opt_state->blocks[n] = p;
   2498 
   2499 	number_blks_r(opt_state, ic, JT(p));
   2500 	number_blks_r(opt_state, ic, JF(p));
   2501 }
   2502 
   2503 /*
   2504  * Return the number of stmts in the flowgraph reachable by 'p'.
   2505  * The nodes should be unmarked before calling.
   2506  *
   2507  * Note that "stmts" means "instructions", and that this includes
   2508  *
   2509  *	side-effect statements in 'p' (slength(p->stmts));
   2510  *
   2511  *	statements in the true branch from 'p' (count_stmts(JT(p)));
   2512  *
   2513  *	statements in the false branch from 'p' (count_stmts(JF(p)));
   2514  *
   2515  *	the conditional jump itself (1);
   2516  *
   2517  *	an extra long jump if the true branch requires it (p->longjt);
   2518  *
   2519  *	an extra long jump if the false branch requires it (p->longjf).
   2520  */
   2521 static u_int
   2522 count_stmts(struct icode *ic, struct block *p)
   2523 {
   2524 	u_int n;
   2525 
   2526 	if (p == 0 || isMarked(ic, p))
   2527 		return 0;
   2528 	Mark(ic, p);
   2529 	n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
   2530 	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
   2531 }
   2532 
   2533 /*
   2534  * Allocate memory.  All allocation is done before optimization
   2535  * is begun.  A linear bound on the size of all data structures is computed
   2536  * from the total number of blocks and/or statements.
   2537  */
static void
opt_init(opt_state_t *opt_state, struct icode *ic)
{
	bpf_u_int32 *p;
	int i, n, max_stmts;
	u_int product;
	size_t block_memsize, edge_memsize;

	/*
	 * First, count the blocks, so we can malloc an array to map
	 * block number to block.  Then, put the blocks into the array.
	 */
	unMarkAll(ic);
	n = count_blocks(ic, ic->root);
	opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
	if (opt_state->blocks == NULL)
		opt_error(opt_state, "malloc");
	unMarkAll(ic);
	opt_state->n_blocks = 0;
	number_blks_r(opt_state, ic, ic->root);

	/*
	 * This "should not happen".
	 */
	if (opt_state->n_blocks == 0)
		opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue");

	/* Every block has exactly two out-edges (true and false). */
	opt_state->n_edges = 2 * opt_state->n_blocks;
	if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
		/*
		 * Overflow.
		 */
		opt_error(opt_state, "filter is too complex to optimize");
	}
	opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
	if (opt_state->edges == NULL) {
		opt_error(opt_state, "malloc");
	}

	/*
	 * The number of levels is bounded by the number of nodes.
	 */
	opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
	if (opt_state->levels == NULL) {
		opt_error(opt_state, "malloc");
	}

	/* Words needed for one edge bit-set and one node bit-set. */
	opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
	opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;

	/*
	 * Make sure opt_state->n_blocks * opt_state->nodewords fits
	 * in a u_int; we use it as a u_int number-of-iterations
	 * value.
	 */
	product = opt_state->n_blocks * opt_state->nodewords;
	if ((product / opt_state->n_blocks) != opt_state->nodewords) {
		/*
		 * XXX - just punt and don't try to optimize?
		 * In practice, this is unlikely to happen with
		 * a normal filter.
		 */
		opt_error(opt_state, "filter is too complex to optimize");
	}

	/*
	 * Make sure the total memory required for that doesn't
	 * overflow.  The factor of 2 covers the dominator sets plus
	 * the closure sets, one node-sized set of each per block.
	 */
	block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
	if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
		opt_error(opt_state, "filter is too complex to optimize");
	}

	/*
	 * Make sure opt_state->n_edges * opt_state->edgewords fits
	 * in a u_int; we use it as a u_int number-of-iterations
	 * value.
	 */
	product = opt_state->n_edges * opt_state->edgewords;
	if ((product / opt_state->n_edges) != opt_state->edgewords) {
		opt_error(opt_state, "filter is too complex to optimize");
	}

	/*
	 * Make sure the total memory required for that doesn't
	 * overflow.
	 */
	edge_memsize = (size_t)product * sizeof(*opt_state->space);
	if (edge_memsize / product != sizeof(*opt_state->space)) {
		opt_error(opt_state, "filter is too complex to optimize");
	}

	/*
	 * Make sure the total memory required for both of them doesn't
	 * overflow.
	 */
	if (block_memsize > SIZE_MAX - edge_memsize) {
		opt_error(opt_state, "filter is too complex to optimize");
	}

	/*
	 * XXX
	 * One arena holds all the bit sets; it is carved up below into
	 * per-block dominator sets, per-block closure sets, and
	 * per-edge edge-dominator sets, in that order.
	 */
	opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
	if (opt_state->space == NULL) {
		opt_error(opt_state, "malloc");
	}
	p = opt_state->space;
	/* Carve out one dominator set per block. */
	opt_state->all_dom_sets = p;
	for (i = 0; i < n; ++i) {
		opt_state->blocks[i]->dom = p;
		p += opt_state->nodewords;
	}
	/* Carve out one closure set per block. */
	opt_state->all_closure_sets = p;
	for (i = 0; i < n; ++i) {
		opt_state->blocks[i]->closure = p;
		p += opt_state->nodewords;
	}
	/*
	 * Carve out the edge-dominator sets and number the edges:
	 * true edges get ids 0..n_blocks-1, false edges get
	 * n_blocks..2*n_blocks-1.
	 */
	opt_state->all_edge_sets = p;
	for (i = 0; i < n; ++i) {
		register struct block *b = opt_state->blocks[i];

		b->et.edom = p;
		p += opt_state->edgewords;
		b->ef.edom = p;
		p += opt_state->edgewords;
		b->et.id = i;
		opt_state->edges[i] = &b->et;
		b->ef.id = opt_state->n_blocks + i;
		opt_state->edges[opt_state->n_blocks + i] = &b->ef;
		b->et.pred = b;
		b->ef.pred = b;
	}
	/* Total instruction count: statements plus one jump per block. */
	max_stmts = 0;
	for (i = 0; i < n; ++i)
		max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
	/*
	 * We allocate at most 3 value numbers per statement,
	 * so this is an upper bound on the number of valnodes
	 * we'll need.
	 */
	opt_state->maxval = 3 * max_stmts;
	opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
	if (opt_state->vmap == NULL) {
		opt_error(opt_state, "malloc");
	}
	opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
	if (opt_state->vnode_base == NULL) {
		opt_error(opt_state, "malloc");
	}
}
   2688 
   2689 /*
   2690  * This is only used when supporting optimizer debugging.  It is
   2691  * global state, so do *not* do more than one compile in parallel
   2692  * and expect it to provide meaningful information.
   2693  */
   2694 #ifdef BDEBUG
   2695 int bids[NBIDS];
   2696 #endif
   2697 
   2698 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
   2699     PCAP_PRINTFLIKE(2, 3);
   2700 
   2701 /*
   2702  * Returns true if successful.  Returns false if a branch has
   2703  * an offset that is too large.  If so, we have marked that
   2704  * branch so that on a subsequent iteration, it will be treated
   2705  * properly.
   2706  */
static int
convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
{
	struct bpf_insn *dst;
	struct slist *src;
	u_int slen;
	u_int off;
	struct slist **offset = NULL;

	if (p == 0 || isMarked(ic, p))
		return (1);
	Mark(ic, p);

	/*
	 * Emit both successors first; code is laid down from the end
	 * of the output array (ftail) backward, so successors must
	 * already have their offsets assigned when we fill in this
	 * block's jumps.
	 */
	if (convert_code_r(conv_state, ic, JF(p)) == 0)
		return (0);
	if (convert_code_r(conv_state, ic, JT(p)) == 0)
		return (0);

	/* Reserve room for this block's statements, its jump, and
	 * any extra long jumps, by backing up the tail pointer. */
	slen = slength(p->stmts);
	dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
		/* inflate length by any extra jumps */

	p->offset = (int)(dst - conv_state->fstart);

	/* generate offset[] for convenience  */
	if (slen) {
		offset = (struct slist **)calloc(slen, sizeof(struct slist *));
		if (!offset) {
			conv_error(conv_state, "not enough core");
			/*NOTREACHED*/
		}
	}
	/* Map each emitted-instruction index back to its stmt node,
	 * so block-local jump targets can be resolved below. */
	src = p->stmts;
	for (off = 0; off < slen && src; off++) {
#if 0
		printf("off=%d src=%x\n", off, src);
#endif
		offset[off] = src;
		src = src->next;
	}

	off = 0;
	for (src = p->stmts; src; src = src->next) {
		if (src->s.code == NOP)
			continue;
		dst->code = (u_short)src->s.code;
		dst->k = src->s.k;

		/* fill block-local relative jump */
		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
#if 0
			if (src->s.jt || src->s.jf) {
				free(offset);
				conv_error(conv_state, "illegal jmp destination");
				/*NOTREACHED*/
			}
#endif
			goto filled;
		}
		if (off == slen - 2)	/*???*/
			goto filled;

	    {
		u_int i;
		int jt, jf;
		const char ljerr[] = "%s for block-local relative jump: off=%d";

#if 0
		printf("code=%x off=%d %x %x\n", src->s.code,
			off, src->s.jt, src->s.jf);
#endif

		if (!src->s.jt || !src->s.jf) {
			free(offset);
			conv_error(conv_state, ljerr, "no jmp destination", off);
			/*NOTREACHED*/
		}

		/* Search offset[] for the instruction indices of the
		 * jt/jf targets; each must match exactly once and be
		 * within the 8-bit relative-jump range. */
		jt = jf = 0;
		for (i = 0; i < slen; i++) {
			if (offset[i] == src->s.jt) {
				if (jt) {
					free(offset);
					conv_error(conv_state, ljerr, "multiple matches", off);
					/*NOTREACHED*/
				}

				if (i - off - 1 >= 256) {
					free(offset);
					conv_error(conv_state, ljerr, "out-of-range jump", off);
					/*NOTREACHED*/
				}
				dst->jt = (u_char)(i - off - 1);
				jt++;
			}
			if (offset[i] == src->s.jf) {
				if (jf) {
					free(offset);
					conv_error(conv_state, ljerr, "multiple matches", off);
					/*NOTREACHED*/
				}
				if (i - off - 1 >= 256) {
					free(offset);
					conv_error(conv_state, ljerr, "out-of-range jump", off);
					/*NOTREACHED*/
				}
				dst->jf = (u_char)(i - off - 1);
				jf++;
			}
		}
		if (!jt || !jf) {
			free(offset);
			conv_error(conv_state, ljerr, "no destination found", off);
			/*NOTREACHED*/
		}
	    }
filled:
		++dst;
		++off;
	}
	if (offset)
		free(offset);

#ifdef BDEBUG
	if (dst - conv_state->fstart < NBIDS)
		bids[dst - conv_state->fstart] = p->id + 1;
#endif
	/* Emit the block's terminating branch (or return). */
	dst->code = (u_short)p->s.code;
	dst->k = p->s.k;
	if (JT(p)) {
		/* number of extra jumps inserted */
		u_char extrajmps = 0;
		off = JT(p)->offset - (p->offset + slen) - 1;
		if (off >= 256) {
		    /* offset too large for branch, must add a jump */
		    if (p->longjt == 0) {
			/* mark this instruction and retry */
			p->longjt++;
			return(0);
		    }
		    dst->jt = extrajmps;
		    extrajmps++;
		    dst[extrajmps].code = BPF_JMP|BPF_JA;
		    dst[extrajmps].k = off - extrajmps;
		}
		else
		    dst->jt = (u_char)off;
		off = JF(p)->offset - (p->offset + slen) - 1;
		if (off >= 256) {
		    /* offset too large for branch, must add a jump */
		    if (p->longjf == 0) {
			/* mark this instruction and retry */
			p->longjf++;
			return(0);
		    }
		    /* branch if F to following jump */
		    /* if two jumps are inserted, F goes to second one */
		    dst->jf = extrajmps;
		    extrajmps++;
		    dst[extrajmps].code = BPF_JMP|BPF_JA;
		    dst[extrajmps].k = off - extrajmps;
		}
		else
		    dst->jf = (u_char)off;
	}
	return (1);
}
   2874 
   2875 
   2876 /*
   2877  * Convert flowgraph intermediate representation to the
   2878  * BPF array representation.  Set *lenp to the number of instructions.
   2879  *
   2880  * This routine does *NOT* leak the memory pointed to by fp.  It *must
   2881  * not* do free(fp) before returning fp; doing so would make no sense,
   2882  * as the BPF array pointed to by the return value of icode_to_fcode()
   2883  * must be valid - it's being returned for use in a bpf_program structure.
   2884  *
   2885  * If it appears that icode_to_fcode() is leaking, the problem is that
   2886  * the program using pcap_compile() is failing to free the memory in
   2887  * the BPF program when it's done - the leak is in the program, not in
   2888  * the routine that happens to be allocating the memory.  (By analogy, if
   2889  * a program calls fopen() without ever calling fclose() on the FILE *,
   2890  * it will leak the FILE structure; the leak is not in fopen(), it's in
   2891  * the program.)  Change the program to use pcap_freecode() when it's
   2892  * done with the filter program.  See the pcap man page.
   2893  */
struct bpf_insn *
icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
    char *errbuf)
{
	u_int n;
	struct bpf_insn *fp;
	conv_state_t conv_state;

	conv_state.fstart = NULL;
	conv_state.errbuf = errbuf;
	/*
	 * conv_error() longjmp()s back here after formatting a message
	 * into errbuf; free the partially-built array and give up.
	 */
	if (setjmp(conv_state.top_ctx) != 0) {
		free(conv_state.fstart);
		return NULL;
	}

	/*
	 * Loop doing convert_code_r() until no branches remain
	 * with too-large offsets.  Each failed attempt marks the
	 * offending block (longjt/longjf), which makes count_stmts()
	 * reserve room for the extra jump on the next attempt.
	 */
	for (;;) {
	    unMarkAll(ic);
	    n = *lenp = count_stmts(ic, root);

	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
	    if (fp == NULL) {
		(void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
		    "malloc");
		return NULL;
	    }
	    memset((char *)fp, 0, sizeof(*fp) * n);
	    conv_state.fstart = fp;
	    conv_state.ftail = fp + n;

	    unMarkAll(ic);
	    if (convert_code_r(&conv_state, ic, root))
		break;
	    free(fp);
	}

	return fp;
}
   2935 
   2936 /*
   2937  * For iconv_to_fconv() errors.
   2938  */
static void PCAP_NORETURN
conv_error(conv_state_t *conv_state, const char *fmt, ...)
{
	va_list ap;

	/* Format the message into the caller-supplied buffer. */
	va_start(ap, fmt);
	(void)vsnprintf(conv_state->errbuf,
	    PCAP_ERRBUF_SIZE, fmt, ap);
	va_end(ap);
	/* Unwind to the setjmp() in icode_to_fcode(); never returns. */
	longjmp(conv_state->top_ctx, 1);
	/* NOTREACHED */
#ifdef _AIX
	PCAP_UNREACHABLE
#endif /* _AIX */
}
   2954 
   2955 /*
   2956  * Make a copy of a BPF program and put it in the "fcode" member of
   2957  * a "pcap_t".
   2958  *
   2959  * If we fail to allocate memory for the copy, fill in the "errbuf"
   2960  * member of the "pcap_t" with an error message, and return -1;
   2961  * otherwise, return 0.
   2962  */
   2963 int
   2964 pcapint_install_bpf_program(pcap_t *p, struct bpf_program *fp)
   2965 {
   2966 	size_t prog_size;
   2967 
   2968 	/*
   2969 	 * Validate the program.
   2970 	 */
   2971 	if (!pcapint_validate_filter(fp->bf_insns, fp->bf_len)) {
   2972 		snprintf(p->errbuf, sizeof(p->errbuf),
   2973 			"BPF program is not valid");
   2974 		return (-1);
   2975 	}
   2976 
   2977 	/*
   2978 	 * Free up any already installed program.
   2979 	 */
   2980 	pcap_freecode(&p->fcode);
   2981 
   2982 	prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
   2983 	p->fcode.bf_len = fp->bf_len;
   2984 	p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
   2985 	if (p->fcode.bf_insns == NULL) {
   2986 		pcapint_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
   2987 		    errno, "malloc");
   2988 		return (-1);
   2989 	}
   2990 	memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
   2991 	return (0);
   2992 }
   2993 
   2994 #ifdef BDEBUG
/*
 * Emit one CFG block as a DOT node on "out", then recurse into both
 * successors.  Visits each block at most once, using the icode marks;
 * the caller must unMarkAll() beforehand.
 */
static void
dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
    FILE *out)
{
	int icount, noffset;
	int i;

	/* Depth-first walk; the mark stops revisits of shared blocks. */
	if (block == NULL || isMarked(ic, block))
		return;
	Mark(ic, block);

	/*
	 * Number of instructions generated for this block, including the
	 * long-jump adjustments (longjt/longjf); clamp the end offset to
	 * the program length.
	 */
	icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
	noffset = min(block->offset + icount, (int)prog->bf_len);

	/* Node label: the block id plus its disassembled instructions. */
	fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
	for (i = block->offset; i < noffset; i++) {
		fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
	}
	/*
	 * Tooltip: value indexes known at block exit for each scratch
	 * memory word and for the A and X registers (unknown memory
	 * words are omitted).
	 */
	fprintf(out, "\" tooltip=\"");
	for (i = 0; i < BPF_MEMWORDS; i++)
		if (block->val[i] != VAL_UNKNOWN)
			fprintf(out, "val[%d]=%d ", i, block->val[i]);
	fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
	fprintf(out, "val[X]=%d", block->val[X_ATOM]);
	fprintf(out, "\"");
	/*
	 * A block with no true-branch successor is terminal (a return
	 * block in the example below); draw it with a double outline.
	 */
	if (JT(block) == NULL)
		fprintf(out, ", peripheries=2");
	fprintf(out, "];\n");

	dot_dump_node(ic, JT(block), prog, out);
	dot_dump_node(ic, JF(block), prog, out);
}
   3027 
   3028 static void
   3029 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
   3030 {
   3031 	if (block == NULL || isMarked(ic, block))
   3032 		return;
   3033 	Mark(ic, block);
   3034 
   3035 	if (JT(block)) {
   3036 		fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
   3037 				block->id, JT(block)->id);
   3038 		fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
   3039 			   block->id, JF(block)->id);
   3040 	}
   3041 	dot_dump_edge(ic, JT(block), out);
   3042 	dot_dump_edge(ic, JF(block), out);
   3043 }
   3044 
   3045 /* Output the block CFG using graphviz/DOT language
   3046  * In the CFG, block's code, value index for each registers at EXIT,
   3047  * and the jump relationship is show.
   3048  *
   3049  * example DOT for BPF `ip src host 1.1.1.1' is:
   3050     digraph BPF {
   3051 	block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2	jf 5" tooltip="val[A]=0 val[X]=0"];
   3052 	block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4	jf 5" tooltip="val[A]=0 val[X]=0"];
   3053 	block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
   3054 	block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
   3055 	"block0":se -> "block1":n [label="T"];
   3056 	"block0":sw -> "block3":n [label="F"];
   3057 	"block1":se -> "block2":n [label="T"];
   3058 	"block1":sw -> "block3":n [label="F"];
   3059     }
   3060  *
   3061  *  After install graphviz on https://www.graphviz.org/, save it as bpf.dot
   3062  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
   3063  */
   3064 static int
   3065 dot_dump(struct icode *ic, char *errbuf)
   3066 {
   3067 	struct bpf_program f;
   3068 	FILE *out = stdout;
   3069 
   3070 	memset(bids, 0, sizeof bids);
   3071 	f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
   3072 	if (f.bf_insns == NULL)
   3073 		return -1;
   3074 
   3075 	fprintf(out, "digraph BPF {\n");
   3076 	unMarkAll(ic);
   3077 	dot_dump_node(ic, ic->root, &f, out);
   3078 	unMarkAll(ic);
   3079 	dot_dump_edge(ic, ic->root, out);
   3080 	fprintf(out, "}\n");
   3081 
   3082 	free((char *)f.bf_insns);
   3083 	return 0;
   3084 }
   3085 
   3086 static int
   3087 plain_dump(struct icode *ic, char *errbuf)
   3088 {
   3089 	struct bpf_program f;
   3090 
   3091 	memset(bids, 0, sizeof bids);
   3092 	f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
   3093 	if (f.bf_insns == NULL)
   3094 		return -1;
   3095 	bpf_dump(&f, 1);
   3096 	putchar('\n');
   3097 	free((char *)f.bf_insns);
   3098 	return 0;
   3099 }
   3100 
   3101 static void
   3102 opt_dump(opt_state_t *opt_state, struct icode *ic)
   3103 {
   3104 	int status;
   3105 	char errbuf[PCAP_ERRBUF_SIZE];
   3106 
   3107 	/*
   3108 	 * If the CFG, in DOT format, is requested, output it rather than
   3109 	 * the code that would be generated from that graph.
   3110 	 */
   3111 	if (pcap_print_dot_graph)
   3112 		status = dot_dump(ic, errbuf);
   3113 	else
   3114 		status = plain_dump(ic, errbuf);
   3115 	if (status == -1)
   3116 		opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
   3117 }
   3118 #endif
   3119