Home | History | Annotate | Line # | Download | only in dist
optimize.c revision 1.1.1.11
      1 /*
      2  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
      3  *	The Regents of the University of California.  All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that: (1) source code distributions
      7  * retain the above copyright notice and this paragraph in its entirety, (2)
      8  * distributions including binary code include the above copyright notice and
      9  * this paragraph in its entirety in the documentation or other materials
     10  * provided with the distribution, and (3) all advertising materials mentioning
     11  * features or use of this software display the following acknowledgement:
     12  * ``This product includes software developed by the University of California,
     13  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
     14  * the University nor the names of its contributors may be used to endorse
     15  * or promote products derived from this software without specific prior
     16  * written permission.
     17  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
     18  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
     19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
     20  *
     21  *  Optimization module for BPF code intermediate representation.
     22  */
     23 
     24 #include <config.h>
     25 
     26 #include <pcap-types.h>
     27 
     28 #include <stdio.h>
     29 #include <stdlib.h>
     30 #include <memory.h>
     31 #include <setjmp.h>
     32 #include <string.h>
     33 #include <limits.h> /* for SIZE_MAX */
     34 #include <errno.h>
     35 
     36 #include "pcap-int.h"
     37 
     38 #include "gencode.h"
     39 #include "optimize.h"
     40 #include "diag-control.h"
     41 
     42 #ifdef HAVE_OS_PROTO_H
     43 #include "os-proto.h"
     44 #endif
     45 
     46 #ifdef BDEBUG
     47 /*
     48  * The internal "debug printout" flag for the filter expression optimizer.
     49  * The code to print that stuff is present only if BDEBUG is defined, so
     50  * the flag, and the routine to set it, are defined only if BDEBUG is
     51  * defined.
     52  */
     53 static int pcap_optimizer_debug;
     54 
     55 /*
     56  * Routine to set that flag.
     57  *
     58  * This is intended for libpcap developers, not for general use.
     59  * If you want to set these in a program, you'll have to declare this
     60  * routine yourself, with the appropriate DLL import attribute on Windows;
     61  * it's not declared in any header file, and won't be declared in any
     62  * header file provided by libpcap.
     63  */
     64 PCAP_API void pcap_set_optimizer_debug(int value);
     65 
     66 PCAP_API_DEF void
     67 pcap_set_optimizer_debug(int value)
     68 {
     69 	pcap_optimizer_debug = value;
     70 }
     71 
     72 /*
     73  * The internal "print dot graph" flag for the filter expression optimizer.
     74  * The code to print that stuff is present only if BDEBUG is defined, so
     75  * the flag, and the routine to set it, are defined only if BDEBUG is
     76  * defined.
     77  */
     78 static int pcap_print_dot_graph;
     79 
     80 /*
     81  * Routine to set that flag.
     82  *
     83  * This is intended for libpcap developers, not for general use.
     84  * If you want to set these in a program, you'll have to declare this
     85  * routine yourself, with the appropriate DLL import attribute on Windows;
     86  * it's not declared in any header file, and won't be declared in any
     87  * header file provided by libpcap.
     88  */
     89 PCAP_API void pcap_set_print_dot_graph(int value);
     90 
     91 PCAP_API_DEF void
     92 pcap_set_print_dot_graph(int value)
     93 {
     94 	pcap_print_dot_graph = value;
     95 }
     96 
     97 #endif
     98 
     99 /*
    100  * lowest_set_bit().
    101  *
    102  * Takes a 32-bit integer as an argument.
    103  *
    104  * If handed a non-zero value, returns the index of the lowest set bit,
    105  * counting upwards from zero.
    106  *
    107  * If handed zero, the results are platform- and compiler-dependent.
    108  * Keep it out of the light, don't give it any water, don't feed it
    109  * after midnight, and don't pass zero to it.
    110  *
    111  * This is the same as the count of trailing zeroes in the word.
    112  */
    113 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
    114   /*
    115    * GCC 3.4 and later; we have __builtin_ctz().
    116    */
    117   #define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
    118 #elif defined(_MSC_VER)
    119   /*
    120    * Visual Studio; we support only 2005 and later, so use
    121    * _BitScanForward().
    122    */
    123 #include <intrin.h>
    124 
    125 #ifndef __clang__
    126 #pragma intrinsic(_BitScanForward)
    127 #endif
    128 
    129 static __forceinline u_int
    130 lowest_set_bit(int mask)
    131 {
    132 	unsigned long bit;
    133 
    134 	/*
    135 	 * Don't sign-extend mask if long is longer than int.
    136 	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
    137 	 */
    138 	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
    139 		abort();	/* mask is zero */
    140 	return (u_int)bit;
    141 }
    142 #elif defined(MSDOS) && defined(__DJGPP__)
    143   /*
    144    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
    145    * we've already included.
    146    */
    147   #define lowest_set_bit(mask)	((u_int)(ffs((mask)) - 1))
    148 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
    149   /*
    150    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
    151    * or some other platform (UN*X conforming to a sufficient recent version
    152    * of the Single UNIX Specification).
    153    */
    154   #include <strings.h>
    155   #define lowest_set_bit(mask)	(u_int)((ffs((mask)) - 1))
    156 #else
    157 /*
    158  * None of the above.
    159  * Use a perfect-hash-function-based function.
    160  */
    161 static u_int
    162 lowest_set_bit(int mask)
    163 {
    164 	unsigned int v = (unsigned int)mask;
    165 
    166 	static const u_int MultiplyDeBruijnBitPosition[32] = {
    167 		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    168 		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    169 	};
    170 
    171 	/*
    172 	 * We strip off all but the lowermost set bit (v & ~v),
    173 	 * and perform a minimal perfect hash on it to look up the
    174 	 * number of low-order zero bits in a table.
    175 	 *
    176 	 * See:
    177 	 *
    178 	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
    179 	 *
    180 	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
    181 	 */
    182 	return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
    183 }
    184 #endif
    185 
    186 /*
    187  * Represents a deleted instruction.
    188  */
    189 #define NOP -1
    190 
    191 /*
    192  * Register numbers for use-def values.
    193  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
    194  * location.  A_ATOM is the accumulator and X_ATOM is the index
    195  * register.
    196  */
    197 #define A_ATOM BPF_MEMWORDS
    198 #define X_ATOM (BPF_MEMWORDS+1)
    199 
    200 /*
    201  * This define is used to represent *both* the accumulator and
    202  * x register in use-def computations.
    203  * Currently, the use-def code assumes only one definition per instruction.
    204  */
    205 #define AX_ATOM N_ATOMS
    206 
    207 /*
    208  * These data structures are used in a Cocke and Schwartz style
    209  * value numbering scheme.  Since the flowgraph is acyclic,
    210  * exit values can be propagated from a node's predecessors
    211  * provided it is uniquely defined.
    212  */
    213 struct valnode {
    214 	int code;
    215 	bpf_u_int32 v0, v1;
    216 	int val;		/* the value number */
    217 	struct valnode *next;
    218 };
    219 
    220 /* Integer constants mapped with the load immediate opcode. */
    221 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)
    222 
    223 struct vmapinfo {
    224 	int is_const;
    225 	bpf_u_int32 const_val;
    226 };
    227 
    228 typedef struct {
    229 	/*
    230 	 * Place to longjmp to on an error.
    231 	 */
    232 	jmp_buf top_ctx;
    233 
    234 	/*
    235 	 * The buffer into which to put error message.
    236 	 */
    237 	char *errbuf;
    238 
    239 	/*
    240 	 * A flag to indicate that further optimization is needed.
    241 	 * Iterative passes are continued until a given pass yields no
    242 	 * code simplification or branch movement.
    243 	 */
    244 	int done;
    245 
    246 	/*
    247 	 * XXX - detect loops that do nothing but repeated AND/OR pullups
    248 	 * and edge moves.
    249 	 * If 100 passes in a row do nothing but that, treat that as a
    250 	 * sign that we're in a loop that just shuffles in a cycle in
    251 	 * which each pass just shuffles the code and we eventually
    252 	 * get back to the original configuration.
    253 	 *
    254 	 * XXX - we need a non-heuristic way of detecting, or preventing,
    255 	 * such a cycle.
    256 	 */
    257 	int non_branch_movement_performed;
    258 
    259 	u_int n_blocks;		/* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
    260 	struct block **blocks;
    261 	u_int n_edges;		/* twice n_blocks, so guaranteed to be > 0 */
    262 	struct edge **edges;
    263 
    264 	/*
    265 	 * A bit vector set representation of the dominators.
    266 	 * We round up the set size to the next power of two.
    267 	 */
    268 	u_int nodewords;	/* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
    269 	u_int edgewords;	/* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
    270 	struct block **levels;
    271 	bpf_u_int32 *space;
    272 
    273 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
    274 /*
    275  * True if a is in uset {p}
    276  */
    277 #define SET_MEMBER(p, a) \
    278 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
    279 
    280 /*
    281  * Add 'a' to uset p.
    282  */
    283 #define SET_INSERT(p, a) \
    284 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
    285 
    286 /*
    287  * Delete 'a' from uset p.
    288  */
    289 #define SET_DELETE(p, a) \
    290 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
    291 
    292 /*
    293  * a := a intersect b
    294  * n must be guaranteed to be > 0
    295  */
    296 #define SET_INTERSECT(a, b, n)\
    297 {\
    298 	register bpf_u_int32 *_x = a, *_y = b;\
    299 	register u_int _n = n;\
    300 	do *_x++ &= *_y++; while (--_n != 0);\
    301 }
    302 
    303 /*
    304  * a := a - b
    305  * n must be guaranteed to be > 0
    306  */
    307 #define SET_SUBTRACT(a, b, n)\
    308 {\
    309 	register bpf_u_int32 *_x = a, *_y = b;\
    310 	register u_int _n = n;\
    311 	do *_x++ &=~ *_y++; while (--_n != 0);\
    312 }
    313 
    314 /*
    315  * a := a union b
    316  * n must be guaranteed to be > 0
    317  */
    318 #define SET_UNION(a, b, n)\
    319 {\
    320 	register bpf_u_int32 *_x = a, *_y = b;\
    321 	register u_int _n = n;\
    322 	do *_x++ |= *_y++; while (--_n != 0);\
    323 }
    324 
    325 	uset all_dom_sets;
    326 	uset all_closure_sets;
    327 	uset all_edge_sets;
    328 
    329 #define MODULUS 213
    330 	struct valnode *hashtbl[MODULUS];
    331 	bpf_u_int32 curval;
    332 	bpf_u_int32 maxval;
    333 
    334 	struct vmapinfo *vmap;
    335 	struct valnode *vnode_base;
    336 	struct valnode *next_vnode;
    337 } opt_state_t;
    338 
    339 typedef struct {
    340 	/*
    341 	 * Place to longjmp to on an error.
    342 	 */
    343 	jmp_buf top_ctx;
    344 
    345 	/*
    346 	 * The buffer into which to put error message.
    347 	 */
    348 	char *errbuf;
    349 
    350 	/*
    351 	 * Some pointers used to convert the basic block form of the code,
    352 	 * into the array form that BPF requires.  'fstart' will point to
    353 	 * the malloc'd array while 'ftail' is used during the recursive
    354 	 * traversal.
    355 	 */
    356 	struct bpf_insn *fstart;
    357 	struct bpf_insn *ftail;
    358 } conv_state_t;
    359 
    360 static void opt_init(opt_state_t *, struct icode *);
    361 static void opt_cleanup(opt_state_t *);
    362 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
    363     PCAP_PRINTFLIKE(2, 3);
    364 
    365 static void intern_blocks(opt_state_t *, struct icode *);
    366 
    367 static void find_inedges(opt_state_t *, struct block *);
    368 #ifdef BDEBUG
    369 static void opt_dump(opt_state_t *, struct icode *);
    370 #endif
    371 
    372 #ifndef MAX
    373 #define MAX(a,b) ((a)>(b)?(a):(b))
    374 #endif
    375 
    376 static void
    377 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
    378 {
    379 	int level;
    380 
    381 	if (isMarked(ic, b))
    382 		return;
    383 
    384 	Mark(ic, b);
    385 	b->link = 0;
    386 
    387 	if (JT(b)) {
    388 		find_levels_r(opt_state, ic, JT(b));
    389 		find_levels_r(opt_state, ic, JF(b));
    390 		level = MAX(JT(b)->level, JF(b)->level) + 1;
    391 	} else
    392 		level = 0;
    393 	b->level = level;
    394 	b->link = opt_state->levels[level];
    395 	opt_state->levels[level] = b;
    396 }
    397 
    398 /*
    399  * Level graph.  The levels go from 0 at the leaves to
    400  * N_LEVELS at the root.  The opt_state->levels[] array points to the
    401  * first node of the level list, whose elements are linked
    402  * with the 'link' field of the struct block.
    403  */
    404 static void
    405 find_levels(opt_state_t *opt_state, struct icode *ic)
    406 {
    407 	memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
    408 	unMarkAll(ic);
    409 	find_levels_r(opt_state, ic, ic->root);
    410 }
    411 
    412 /*
    413  * Find dominator relationships.
    414  * Assumes graph has been leveled.
    415  */
    416 static void
    417 find_dom(opt_state_t *opt_state, struct block *root)
    418 {
    419 	u_int i;
    420 	int level;
    421 	struct block *b;
    422 	bpf_u_int32 *x;
    423 
    424 	/*
    425 	 * Initialize sets to contain all nodes.
    426 	 */
    427 	x = opt_state->all_dom_sets;
    428 	/*
    429 	 * In opt_init(), we've made sure the product doesn't overflow.
    430 	 */
    431 	i = opt_state->n_blocks * opt_state->nodewords;
    432 	while (i != 0) {
    433 		--i;
    434 		*x++ = 0xFFFFFFFFU;
    435 	}
    436 	/* Root starts off empty. */
    437 	for (i = opt_state->nodewords; i != 0;) {
    438 		--i;
    439 		root->dom[i] = 0;
    440 	}
    441 
    442 	/* root->level is the highest level no found. */
    443 	for (level = root->level; level >= 0; --level) {
    444 		for (b = opt_state->levels[level]; b; b = b->link) {
    445 			SET_INSERT(b->dom, b->id);
    446 			if (JT(b) == 0)
    447 				continue;
    448 			SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
    449 			SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
    450 		}
    451 	}
    452 }
    453 
    454 static void
    455 propedom(opt_state_t *opt_state, struct edge *ep)
    456 {
    457 	SET_INSERT(ep->edom, ep->id);
    458 	if (ep->succ) {
    459 		SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
    460 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
    461 	}
    462 }
    463 
    464 /*
    465  * Compute edge dominators.
    466  * Assumes graph has been leveled and predecessors established.
    467  */
    468 static void
    469 find_edom(opt_state_t *opt_state, struct block *root)
    470 {
    471 	u_int i;
    472 	uset x;
    473 	int level;
    474 	struct block *b;
    475 
    476 	x = opt_state->all_edge_sets;
    477 	/*
    478 	 * In opt_init(), we've made sure the product doesn't overflow.
    479 	 */
    480 	for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
    481 		--i;
    482 		x[i] = 0xFFFFFFFFU;
    483 	}
    484 
    485 	/* root->level is the highest level no found. */
    486 	memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
    487 	memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
    488 	for (level = root->level; level >= 0; --level) {
    489 		for (b = opt_state->levels[level]; b != 0; b = b->link) {
    490 			propedom(opt_state, &b->et);
    491 			propedom(opt_state, &b->ef);
    492 		}
    493 	}
    494 }
    495 
    496 /*
    497  * Find the backwards transitive closure of the flow graph.  These sets
    498  * are backwards in the sense that we find the set of nodes that reach
    499  * a given node, not the set of nodes that can be reached by a node.
    500  *
    501  * Assumes graph has been leveled.
    502  */
    503 static void
    504 find_closure(opt_state_t *opt_state, struct block *root)
    505 {
    506 	int level;
    507 	struct block *b;
    508 
    509 	/*
    510 	 * Initialize sets to contain no nodes.
    511 	 */
    512 	memset((char *)opt_state->all_closure_sets, 0,
    513 	      opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
    514 
    515 	/* root->level is the highest level no found. */
    516 	for (level = root->level; level >= 0; --level) {
    517 		for (b = opt_state->levels[level]; b; b = b->link) {
    518 			SET_INSERT(b->closure, b->id);
    519 			if (JT(b) == 0)
    520 				continue;
    521 			SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
    522 			SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
    523 		}
    524 	}
    525 }
    526 
    527 /*
    528  * Return the register number that is used by s.
    529  *
    530  * Returns ATOM_A if A is used, ATOM_X if X is used, AX_ATOM if both A and X
    531  * are used, the scratch memory location's number if a scratch memory
    532  * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
    533  *
    534  * The implementation should probably change to an array access.
    535  */
    536 static int
    537 atomuse(struct stmt *s)
    538 {
    539 	register int c = s->code;
    540 
    541 	if (c == NOP)
    542 		return -1;
    543 
    544 	switch (BPF_CLASS(c)) {
    545 
    546 	case BPF_RET:
    547 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
    548 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
    549 
    550 	case BPF_LD:
    551 	case BPF_LDX:
    552 		/*
    553 		 * As there are fewer than 2^31 memory locations,
    554 		 * s->k should be convertible to int without problems.
    555 		 */
    556 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
    557 			(BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;
    558 
    559 	case BPF_ST:
    560 		return A_ATOM;
    561 
    562 	case BPF_STX:
    563 		return X_ATOM;
    564 
    565 	case BPF_JMP:
    566 	case BPF_ALU:
    567 		if (BPF_SRC(c) == BPF_X)
    568 			return AX_ATOM;
    569 		return A_ATOM;
    570 
    571 	case BPF_MISC:
    572 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
    573 	}
    574 	abort();
    575 	/* NOTREACHED */
    576 }
    577 
    578 /*
    579  * Return the register number that is defined by 's'.  We assume that
    580  * a single stmt cannot define more than one register.  If no register
    581  * is defined, return -1.
    582  *
    583  * The implementation should probably change to an array access.
    584  */
    585 static int
    586 atomdef(struct stmt *s)
    587 {
    588 	if (s->code == NOP)
    589 		return -1;
    590 
    591 	switch (BPF_CLASS(s->code)) {
    592 
    593 	case BPF_LD:
    594 	case BPF_ALU:
    595 		return A_ATOM;
    596 
    597 	case BPF_LDX:
    598 		return X_ATOM;
    599 
    600 	case BPF_ST:
    601 	case BPF_STX:
    602 		return s->k;
    603 
    604 	case BPF_MISC:
    605 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
    606 	}
    607 	return -1;
    608 }
    609 
    610 /*
    611  * Compute the sets of registers used, defined, and killed by 'b'.
    612  *
    613  * "Used" means that a statement in 'b' uses the register before any
    614  * statement in 'b' defines it, i.e. it uses the value left in
    615  * that register by a predecessor block of this block.
    616  * "Defined" means that a statement in 'b' defines it.
    617  * "Killed" means that a statement in 'b' defines it before any
    618  * statement in 'b' uses it, i.e. it kills the value left in that
    619  * register by a predecessor block of this block.
    620  */
    621 static void
    622 compute_local_ud(struct block *b)
    623 {
    624 	struct slist *s;
    625 	atomset def = 0, use = 0, killed = 0;
    626 	int atom;
    627 
    628 	for (s = b->stmts; s; s = s->next) {
    629 		if (s->s.code == NOP)
    630 			continue;
    631 		atom = atomuse(&s->s);
    632 		if (atom >= 0) {
    633 			if (atom == AX_ATOM) {
    634 				if (!ATOMELEM(def, X_ATOM))
    635 					use |= ATOMMASK(X_ATOM);
    636 				if (!ATOMELEM(def, A_ATOM))
    637 					use |= ATOMMASK(A_ATOM);
    638 			}
    639 			else if (atom < N_ATOMS) {
    640 				if (!ATOMELEM(def, atom))
    641 					use |= ATOMMASK(atom);
    642 			}
    643 			else
    644 				abort();
    645 		}
    646 		atom = atomdef(&s->s);
    647 		if (atom >= 0) {
    648 			if (!ATOMELEM(use, atom))
    649 				killed |= ATOMMASK(atom);
    650 			def |= ATOMMASK(atom);
    651 		}
    652 	}
    653 	if (BPF_CLASS(b->s.code) == BPF_JMP) {
    654 		/*
    655 		 * XXX - what about RET?
    656 		 */
    657 		atom = atomuse(&b->s);
    658 		if (atom >= 0) {
    659 			if (atom == AX_ATOM) {
    660 				if (!ATOMELEM(def, X_ATOM))
    661 					use |= ATOMMASK(X_ATOM);
    662 				if (!ATOMELEM(def, A_ATOM))
    663 					use |= ATOMMASK(A_ATOM);
    664 			}
    665 			else if (atom < N_ATOMS) {
    666 				if (!ATOMELEM(def, atom))
    667 					use |= ATOMMASK(atom);
    668 			}
    669 			else
    670 				abort();
    671 		}
    672 	}
    673 
    674 	b->def = def;
    675 	b->kill = killed;
    676 	b->in_use = use;
    677 }
    678 
    679 /*
    680  * Assume graph is already leveled.
    681  */
    682 static void
    683 find_ud(opt_state_t *opt_state, struct block *root)
    684 {
    685 	int i, maxlevel;
    686 	struct block *p;
    687 
    688 	/*
    689 	 * root->level is the highest level no found;
    690 	 * count down from there.
    691 	 */
    692 	maxlevel = root->level;
    693 	for (i = maxlevel; i >= 0; --i)
    694 		for (p = opt_state->levels[i]; p; p = p->link) {
    695 			compute_local_ud(p);
    696 			p->out_use = 0;
    697 		}
    698 
    699 	for (i = 1; i <= maxlevel; ++i) {
    700 		for (p = opt_state->levels[i]; p; p = p->link) {
    701 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
    702 			p->in_use |= p->out_use &~ p->kill;
    703 		}
    704 	}
    705 }
    706 static void
    707 init_val(opt_state_t *opt_state)
    708 {
    709 	opt_state->curval = 0;
    710 	opt_state->next_vnode = opt_state->vnode_base;
    711 	memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
    712 	memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
    713 }
    714 
    715 /*
    716  * Because we really don't have an IR, this stuff is a little messy.
    717  *
    718  * This routine looks in the table of existing value number for a value
    719  * with generated from an operation with the specified opcode and
    720  * the specified values.  If it finds it, it returns its value number,
    721  * otherwise it makes a new entry in the table and returns the
    722  * value number of that entry.
    723  */
    724 static bpf_u_int32
    725 F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
    726 {
    727 	u_int hash;
    728 	bpf_u_int32 val;
    729 	struct valnode *p;
    730 
    731 	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
    732 	hash %= MODULUS;
    733 
    734 	for (p = opt_state->hashtbl[hash]; p; p = p->next)
    735 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
    736 			return p->val;
    737 
    738 	/*
    739 	 * Not found.  Allocate a new value, and assign it a new
    740 	 * value number.
    741 	 *
    742 	 * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
    743 	 * increment it before using it as the new value number, which
    744 	 * means we never assign VAL_UNKNOWN.
    745 	 *
    746 	 * XXX - unless we overflow, but we probably won't have 2^32-1
    747 	 * values; we treat 32 bits as effectively infinite.
    748 	 */
    749 	val = ++opt_state->curval;
    750 	if (BPF_MODE(code) == BPF_IMM &&
    751 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
    752 		opt_state->vmap[val].const_val = v0;
    753 		opt_state->vmap[val].is_const = 1;
    754 	}
    755 	p = opt_state->next_vnode++;
    756 	p->val = val;
    757 	p->code = code;
    758 	p->v0 = v0;
    759 	p->v1 = v1;
    760 	p->next = opt_state->hashtbl[hash];
    761 	opt_state->hashtbl[hash] = p;
    762 
    763 	return val;
    764 }
    765 
    766 static inline void
    767 vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
    768 {
    769 	if (alter && newval != VAL_UNKNOWN && *valp == newval)
    770 		s->code = NOP;
    771 	else
    772 		*valp = newval;
    773 }
    774 
    775 /*
    776  * Do constant-folding on binary operators.
    777  * (Unary operators are handled elsewhere.)
    778  */
    779 static void
    780 fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
    781 {
    782 	bpf_u_int32 a, b;
    783 
    784 	a = opt_state->vmap[v0].const_val;
    785 	b = opt_state->vmap[v1].const_val;
    786 
    787 	switch (BPF_OP(s->code)) {
    788 	case BPF_ADD:
    789 		a += b;
    790 		break;
    791 
    792 	case BPF_SUB:
    793 		a -= b;
    794 		break;
    795 
    796 	case BPF_MUL:
    797 		a *= b;
    798 		break;
    799 
    800 	case BPF_DIV:
    801 		if (b == 0)
    802 			opt_error(opt_state, "division by zero");
    803 		a /= b;
    804 		break;
    805 
    806 	case BPF_MOD:
    807 		if (b == 0)
    808 			opt_error(opt_state, "modulus by zero");
    809 		a %= b;
    810 		break;
    811 
    812 	case BPF_AND:
    813 		a &= b;
    814 		break;
    815 
    816 	case BPF_OR:
    817 		a |= b;
    818 		break;
    819 
    820 	case BPF_XOR:
    821 		a ^= b;
    822 		break;
    823 
    824 	case BPF_LSH:
    825 		/*
    826 		 * A left shift of more than the width of the type
    827 		 * is undefined in C; we'll just treat it as shifting
    828 		 * all the bits out.
    829 		 *
    830 		 * XXX - the BPF interpreter doesn't check for this,
    831 		 * so its behavior is dependent on the behavior of
    832 		 * the processor on which it's running.  There are
    833 		 * processors on which it shifts all the bits out
    834 		 * and processors on which it does no shift.
    835 		 */
    836 		if (b < 32)
    837 			a <<= b;
    838 		else
    839 			a = 0;
    840 		break;
    841 
    842 	case BPF_RSH:
    843 		/*
    844 		 * A right shift of more than the width of the type
    845 		 * is undefined in C; we'll just treat it as shifting
    846 		 * all the bits out.
    847 		 *
    848 		 * XXX - the BPF interpreter doesn't check for this,
    849 		 * so its behavior is dependent on the behavior of
    850 		 * the processor on which it's running.  There are
    851 		 * processors on which it shifts all the bits out
    852 		 * and processors on which it does no shift.
    853 		 */
    854 		if (b < 32)
    855 			a >>= b;
    856 		else
    857 			a = 0;
    858 		break;
    859 
    860 	default:
    861 		abort();
    862 	}
    863 	s->k = a;
    864 	s->code = BPF_LD|BPF_IMM;
    865 	/*
    866 	 * XXX - optimizer loop detection.
    867 	 */
    868 	opt_state->non_branch_movement_performed = 1;
    869 	opt_state->done = 0;
    870 }
    871 
    872 static inline struct slist *
    873 this_op(struct slist *s)
    874 {
    875 	while (s != 0 && s->s.code == NOP)
    876 		s = s->next;
    877 	return s;
    878 }
    879 
    880 static void
    881 opt_not(struct block *b)
    882 {
    883 	struct block *tmp = JT(b);
    884 
    885 	JT(b) = JF(b);
    886 	JF(b) = tmp;
    887 }
    888 
    889 static void
    890 opt_peep(opt_state_t *opt_state, struct block *b)
    891 {
    892 	struct slist *s;
    893 	struct slist *next, *last;
    894 	bpf_u_int32 val;
    895 
    896 	s = b->stmts;
    897 	if (s == 0)
    898 		return;
    899 
    900 	last = s;
    901 	for (/*empty*/; /*empty*/; s = next) {
    902 		/*
    903 		 * Skip over nops.
    904 		 */
    905 		s = this_op(s);
    906 		if (s == 0)
    907 			break;	/* nothing left in the block */
    908 
    909 		/*
    910 		 * Find the next real instruction after that one
    911 		 * (skipping nops).
    912 		 */
    913 		next = this_op(s->next);
    914 		if (next == 0)
    915 			break;	/* no next instruction */
    916 		last = next;
    917 
    918 		/*
    919 		 * st  M[k]	-->	st  M[k]
    920 		 * ldx M[k]		tax
    921 		 */
    922 		if (s->s.code == BPF_ST &&
    923 		    next->s.code == (BPF_LDX|BPF_MEM) &&
    924 		    s->s.k == next->s.k) {
    925 			/*
    926 			 * XXX - optimizer loop detection.
    927 			 */
    928 			opt_state->non_branch_movement_performed = 1;
    929 			opt_state->done = 0;
    930 			next->s.code = BPF_MISC|BPF_TAX;
    931 		}
    932 		/*
    933 		 * ld  #k	-->	ldx  #k
    934 		 * tax			txa
    935 		 */
    936 		if (s->s.code == (BPF_LD|BPF_IMM) &&
    937 		    next->s.code == (BPF_MISC|BPF_TAX)) {
    938 			s->s.code = BPF_LDX|BPF_IMM;
    939 			next->s.code = BPF_MISC|BPF_TXA;
    940 			/*
    941 			 * XXX - optimizer loop detection.
    942 			 */
    943 			opt_state->non_branch_movement_performed = 1;
    944 			opt_state->done = 0;
    945 		}
    946 		/*
    947 		 * This is an ugly special case, but it happens
    948 		 * when you say tcp[k] or udp[k] where k is a constant.
    949 		 */
    950 		if (s->s.code == (BPF_LD|BPF_IMM)) {
    951 			struct slist *add, *tax, *ild;
    952 
    953 			/*
    954 			 * Check that X isn't used on exit from this
    955 			 * block (which the optimizer might cause).
    956 			 * We know the code generator won't generate
    957 			 * any local dependencies.
    958 			 */
    959 			if (ATOMELEM(b->out_use, X_ATOM))
    960 				continue;
    961 
    962 			/*
    963 			 * Check that the instruction following the ldi
    964 			 * is an addx, or it's an ldxms with an addx
    965 			 * following it (with 0 or more nops between the
    966 			 * ldxms and addx).
    967 			 */
    968 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
    969 				add = next;
    970 			else
    971 				add = this_op(next->next);
    972 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
    973 				continue;
    974 
    975 			/*
    976 			 * Check that a tax follows that (with 0 or more
    977 			 * nops between them).
    978 			 */
    979 			tax = this_op(add->next);
    980 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
    981 				continue;
    982 
    983 			/*
    984 			 * Check that an ild follows that (with 0 or more
    985 			 * nops between them).
    986 			 */
    987 			ild = this_op(tax->next);
    988 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
    989 			    BPF_MODE(ild->s.code) != BPF_IND)
    990 				continue;
    991 			/*
    992 			 * We want to turn this sequence:
    993 			 *
    994 			 * (004) ldi     #0x2		{s}
    995 			 * (005) ldxms   [14]		{next}  -- optional
    996 			 * (006) addx			{add}
    997 			 * (007) tax			{tax}
    998 			 * (008) ild     [x+0]		{ild}
    999 			 *
   1000 			 * into this sequence:
   1001 			 *
   1002 			 * (004) nop
   1003 			 * (005) ldxms   [14]
   1004 			 * (006) nop
   1005 			 * (007) nop
   1006 			 * (008) ild     [x+2]
   1007 			 *
   1008 			 * XXX We need to check that X is not
   1009 			 * subsequently used, because we want to change
   1010 			 * what'll be in it after this sequence.
   1011 			 *
   1012 			 * We know we can eliminate the accumulator
   1013 			 * modifications earlier in the sequence since
   1014 			 * it is defined by the last stmt of this sequence
   1015 			 * (i.e., the last statement of the sequence loads
   1016 			 * a value into the accumulator, so we can eliminate
   1017 			 * earlier operations on the accumulator).
   1018 			 */
   1019 			ild->s.k += s->s.k;
   1020 			s->s.code = NOP;
   1021 			add->s.code = NOP;
   1022 			tax->s.code = NOP;
   1023 			/*
   1024 			 * XXX - optimizer loop detection.
   1025 			 */
   1026 			opt_state->non_branch_movement_performed = 1;
   1027 			opt_state->done = 0;
   1028 		}
   1029 	}
   1030 	/*
   1031 	 * If the comparison at the end of a block is an equality
   1032 	 * comparison against a constant, and nobody uses the value
   1033 	 * we leave in the A register at the end of a block, and
   1034 	 * the operation preceding the comparison is an arithmetic
   1035 	 * operation, we can sometime optimize it away.
   1036 	 */
   1037 	if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
   1038 	    !ATOMELEM(b->out_use, A_ATOM)) {
   1039 		/*
   1040 		 * We can optimize away certain subtractions of the
   1041 		 * X register.
   1042 		 */
   1043 		if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
   1044 			val = b->val[X_ATOM];
   1045 			if (opt_state->vmap[val].is_const) {
   1046 				/*
   1047 				 * If we have a subtract to do a comparison,
   1048 				 * and the X register is a known constant,
   1049 				 * we can merge this value into the
   1050 				 * comparison:
   1051 				 *
   1052 				 * sub x  ->	nop
   1053 				 * jeq #y	jeq #(x+y)
   1054 				 */
   1055 				b->s.k += opt_state->vmap[val].const_val;
   1056 				last->s.code = NOP;
   1057 				/*
   1058 				 * XXX - optimizer loop detection.
   1059 				 */
   1060 				opt_state->non_branch_movement_performed = 1;
   1061 				opt_state->done = 0;
   1062 			} else if (b->s.k == 0) {
   1063 				/*
   1064 				 * If the X register isn't a constant,
   1065 				 * and the comparison in the test is
   1066 				 * against 0, we can compare with the
   1067 				 * X register, instead:
   1068 				 *
   1069 				 * sub x  ->	nop
   1070 				 * jeq #0	jeq x
   1071 				 */
   1072 				last->s.code = NOP;
   1073 				b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
   1074 				/*
   1075 				 * XXX - optimizer loop detection.
   1076 				 */
   1077 				opt_state->non_branch_movement_performed = 1;
   1078 				opt_state->done = 0;
   1079 			}
   1080 		}
   1081 		/*
   1082 		 * Likewise, a constant subtract can be simplified:
   1083 		 *
   1084 		 * sub #x ->	nop
   1085 		 * jeq #y ->	jeq #(x+y)
   1086 		 */
   1087 		else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
   1088 			last->s.code = NOP;
   1089 			b->s.k += last->s.k;
   1090 			/*
   1091 			 * XXX - optimizer loop detection.
   1092 			 */
   1093 			opt_state->non_branch_movement_performed = 1;
   1094 			opt_state->done = 0;
   1095 		}
   1096 		/*
   1097 		 * And, similarly, a constant AND can be simplified
   1098 		 * if we're testing against 0, i.e.:
   1099 		 *
   1100 		 * and #k	nop
   1101 		 * jeq #0  ->	jset #k
   1102 		 */
   1103 		else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
   1104 		    b->s.k == 0) {
   1105 			b->s.k = last->s.k;
   1106 			b->s.code = BPF_JMP|BPF_K|BPF_JSET;
   1107 			last->s.code = NOP;
   1108 			/*
   1109 			 * XXX - optimizer loop detection.
   1110 			 */
   1111 			opt_state->non_branch_movement_performed = 1;
   1112 			opt_state->done = 0;
   1113 			opt_not(b);
   1114 		}
   1115 	}
   1116 	/*
   1117 	 * jset #0        ->   never
   1118 	 * jset #ffffffff ->   always
   1119 	 */
   1120 	if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
   1121 		if (b->s.k == 0)
   1122 			JT(b) = JF(b);
   1123 		if (b->s.k == 0xffffffffU)
   1124 			JF(b) = JT(b);
   1125 	}
   1126 	/*
   1127 	 * If we're comparing against the index register, and the index
   1128 	 * register is a known constant, we can just compare against that
   1129 	 * constant.
   1130 	 */
   1131 	val = b->val[X_ATOM];
   1132 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
   1133 		bpf_u_int32 v = opt_state->vmap[val].const_val;
   1134 		b->s.code &= ~BPF_X;
   1135 		b->s.k = v;
   1136 	}
   1137 	/*
   1138 	 * If the accumulator is a known constant, we can compute the
   1139 	 * comparison result.
   1140 	 */
   1141 	val = b->val[A_ATOM];
   1142 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
   1143 		bpf_u_int32 v = opt_state->vmap[val].const_val;
   1144 		switch (BPF_OP(b->s.code)) {
   1145 
   1146 		case BPF_JEQ:
   1147 			v = v == b->s.k;
   1148 			break;
   1149 
   1150 		case BPF_JGT:
   1151 			v = v > b->s.k;
   1152 			break;
   1153 
   1154 		case BPF_JGE:
   1155 			v = v >= b->s.k;
   1156 			break;
   1157 
   1158 		case BPF_JSET:
   1159 			v &= b->s.k;
   1160 			break;
   1161 
   1162 		default:
   1163 			abort();
   1164 		}
   1165 		if (JF(b) != JT(b)) {
   1166 			/*
   1167 			 * XXX - optimizer loop detection.
   1168 			 */
   1169 			opt_state->non_branch_movement_performed = 1;
   1170 			opt_state->done = 0;
   1171 		}
   1172 		if (v)
   1173 			JF(b) = JT(b);
   1174 		else
   1175 			JT(b) = JF(b);
   1176 	}
   1177 }
   1178 
   1179 /*
   1180  * Compute the symbolic value of expression of 's', and update
   1181  * anything it defines in the value table 'val'.  If 'alter' is true,
   1182  * do various optimizations.  This code would be cleaner if symbolic
   1183  * evaluation and code transformations weren't folded together.
   1184  */
   1185 static void
   1186 opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
   1187 {
   1188 	int op;
   1189 	bpf_u_int32 v;
   1190 
   1191 	switch (s->code) {
   1192 
   1193 	case BPF_LD|BPF_ABS|BPF_W:
   1194 	case BPF_LD|BPF_ABS|BPF_H:
   1195 	case BPF_LD|BPF_ABS|BPF_B:
   1196 		v = F(opt_state, s->code, s->k, 0L);
   1197 		vstore(s, &val[A_ATOM], v, alter);
   1198 		break;
   1199 
   1200 	case BPF_LD|BPF_IND|BPF_W:
   1201 	case BPF_LD|BPF_IND|BPF_H:
   1202 	case BPF_LD|BPF_IND|BPF_B:
   1203 		v = val[X_ATOM];
   1204 		if (alter && opt_state->vmap[v].is_const) {
   1205 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
   1206 			s->k += opt_state->vmap[v].const_val;
   1207 			v = F(opt_state, s->code, s->k, 0L);
   1208 			/*
   1209 			 * XXX - optimizer loop detection.
   1210 			 */
   1211 			opt_state->non_branch_movement_performed = 1;
   1212 			opt_state->done = 0;
   1213 		}
   1214 		else
   1215 			v = F(opt_state, s->code, s->k, v);
   1216 		vstore(s, &val[A_ATOM], v, alter);
   1217 		break;
   1218 
   1219 	case BPF_LD|BPF_LEN:
   1220 		v = F(opt_state, s->code, 0L, 0L);
   1221 		vstore(s, &val[A_ATOM], v, alter);
   1222 		break;
   1223 
   1224 	case BPF_LD|BPF_IMM:
   1225 		v = K(s->k);
   1226 		vstore(s, &val[A_ATOM], v, alter);
   1227 		break;
   1228 
   1229 	case BPF_LDX|BPF_IMM:
   1230 		v = K(s->k);
   1231 		vstore(s, &val[X_ATOM], v, alter);
   1232 		break;
   1233 
   1234 	case BPF_LDX|BPF_MSH|BPF_B:
   1235 		v = F(opt_state, s->code, s->k, 0L);
   1236 		vstore(s, &val[X_ATOM], v, alter);
   1237 		break;
   1238 
   1239 	case BPF_ALU|BPF_NEG:
   1240 		if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
   1241 			s->code = BPF_LD|BPF_IMM;
   1242 			/*
   1243 			 * Do this negation as unsigned arithmetic; that's
   1244 			 * what modern BPF engines do, and it guarantees
   1245 			 * that all possible values can be negated.  (Yeah,
   1246 			 * negating 0x80000000, the minimum signed 32-bit
   1247 			 * two's-complement value, results in 0x80000000,
   1248 			 * so it's still negative, but we *should* be doing
   1249 			 * all unsigned arithmetic here, to match what
   1250 			 * modern BPF engines do.)
   1251 			 *
   1252 			 * Express it as 0U - (unsigned value) so that we
   1253 			 * don't get compiler warnings about negating an
   1254 			 * unsigned value and don't get UBSan warnings
   1255 			 * about the result of negating 0x80000000 being
   1256 			 * undefined.
   1257 			 */
   1258 			s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
   1259 			val[A_ATOM] = K(s->k);
   1260 		}
   1261 		else
   1262 			val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
   1263 		break;
   1264 
   1265 	case BPF_ALU|BPF_ADD|BPF_K:
   1266 	case BPF_ALU|BPF_SUB|BPF_K:
   1267 	case BPF_ALU|BPF_MUL|BPF_K:
   1268 	case BPF_ALU|BPF_DIV|BPF_K:
   1269 	case BPF_ALU|BPF_MOD|BPF_K:
   1270 	case BPF_ALU|BPF_AND|BPF_K:
   1271 	case BPF_ALU|BPF_OR|BPF_K:
   1272 	case BPF_ALU|BPF_XOR|BPF_K:
   1273 	case BPF_ALU|BPF_LSH|BPF_K:
   1274 	case BPF_ALU|BPF_RSH|BPF_K:
   1275 		op = BPF_OP(s->code);
   1276 		if (alter) {
   1277 			if (s->k == 0) {
   1278 				/*
   1279 				 * Optimize operations where the constant
   1280 				 * is zero.
   1281 				 *
   1282 				 * Don't optimize away "sub #0"
   1283 				 * as it may be needed later to
   1284 				 * fixup the generated math code.
   1285 				 *
   1286 				 * Fail if we're dividing by zero or taking
   1287 				 * a modulus by zero.
   1288 				 */
   1289 				if (op == BPF_ADD ||
   1290 				    op == BPF_LSH || op == BPF_RSH ||
   1291 				    op == BPF_OR || op == BPF_XOR) {
   1292 					s->code = NOP;
   1293 					break;
   1294 				}
   1295 				if (op == BPF_MUL || op == BPF_AND) {
   1296 					s->code = BPF_LD|BPF_IMM;
   1297 					val[A_ATOM] = K(s->k);
   1298 					break;
   1299 				}
   1300 				if (op == BPF_DIV)
   1301 					opt_error(opt_state,
   1302 					    "division by zero");
   1303 				if (op == BPF_MOD)
   1304 					opt_error(opt_state,
   1305 					    "modulus by zero");
   1306 			}
   1307 			if (opt_state->vmap[val[A_ATOM]].is_const) {
   1308 				fold_op(opt_state, s, val[A_ATOM], K(s->k));
   1309 				val[A_ATOM] = K(s->k);
   1310 				break;
   1311 			}
   1312 		}
   1313 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
   1314 		break;
   1315 
   1316 	case BPF_ALU|BPF_ADD|BPF_X:
   1317 	case BPF_ALU|BPF_SUB|BPF_X:
   1318 	case BPF_ALU|BPF_MUL|BPF_X:
   1319 	case BPF_ALU|BPF_DIV|BPF_X:
   1320 	case BPF_ALU|BPF_MOD|BPF_X:
   1321 	case BPF_ALU|BPF_AND|BPF_X:
   1322 	case BPF_ALU|BPF_OR|BPF_X:
   1323 	case BPF_ALU|BPF_XOR|BPF_X:
   1324 	case BPF_ALU|BPF_LSH|BPF_X:
   1325 	case BPF_ALU|BPF_RSH|BPF_X:
   1326 		op = BPF_OP(s->code);
   1327 		if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
   1328 			if (opt_state->vmap[val[A_ATOM]].is_const) {
   1329 				fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
   1330 				val[A_ATOM] = K(s->k);
   1331 			}
   1332 			else {
   1333 				s->code = BPF_ALU|BPF_K|op;
   1334 				s->k = opt_state->vmap[val[X_ATOM]].const_val;
   1335 				if ((op == BPF_LSH || op == BPF_RSH) &&
   1336 				    s->k > 31)
   1337 					opt_error(opt_state,
   1338 					    "shift by more than 31 bits");
   1339 				/*
   1340 				 * XXX - optimizer loop detection.
   1341 				 */
   1342 				opt_state->non_branch_movement_performed = 1;
   1343 				opt_state->done = 0;
   1344 				val[A_ATOM] =
   1345 					F(opt_state, s->code, val[A_ATOM], K(s->k));
   1346 			}
   1347 			break;
   1348 		}
   1349 		/*
   1350 		 * Check if we're doing something to an accumulator
   1351 		 * that is 0, and simplify.  This may not seem like
   1352 		 * much of a simplification but it could open up further
   1353 		 * optimizations.
   1354 		 * XXX We could also check for mul by 1, etc.
   1355 		 */
   1356 		if (alter && opt_state->vmap[val[A_ATOM]].is_const
   1357 		    && opt_state->vmap[val[A_ATOM]].const_val == 0) {
   1358 			if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
   1359 				s->code = BPF_MISC|BPF_TXA;
   1360 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
   1361 				break;
   1362 			}
   1363 			else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
   1364 				 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
   1365 				s->code = BPF_LD|BPF_IMM;
   1366 				s->k = 0;
   1367 				vstore(s, &val[A_ATOM], K(s->k), alter);
   1368 				break;
   1369 			}
   1370 			else if (op == BPF_NEG) {
   1371 				s->code = NOP;
   1372 				break;
   1373 			}
   1374 		}
   1375 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
   1376 		break;
   1377 
   1378 	case BPF_MISC|BPF_TXA:
   1379 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
   1380 		break;
   1381 
   1382 	case BPF_LD|BPF_MEM:
   1383 		v = val[s->k];
   1384 		if (alter && opt_state->vmap[v].is_const) {
   1385 			s->code = BPF_LD|BPF_IMM;
   1386 			s->k = opt_state->vmap[v].const_val;
   1387 			/*
   1388 			 * XXX - optimizer loop detection.
   1389 			 */
   1390 			opt_state->non_branch_movement_performed = 1;
   1391 			opt_state->done = 0;
   1392 		}
   1393 		vstore(s, &val[A_ATOM], v, alter);
   1394 		break;
   1395 
   1396 	case BPF_MISC|BPF_TAX:
   1397 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
   1398 		break;
   1399 
   1400 	case BPF_LDX|BPF_MEM:
   1401 		v = val[s->k];
   1402 		if (alter && opt_state->vmap[v].is_const) {
   1403 			s->code = BPF_LDX|BPF_IMM;
   1404 			s->k = opt_state->vmap[v].const_val;
   1405 			/*
   1406 			 * XXX - optimizer loop detection.
   1407 			 */
   1408 			opt_state->non_branch_movement_performed = 1;
   1409 			opt_state->done = 0;
   1410 		}
   1411 		vstore(s, &val[X_ATOM], v, alter);
   1412 		break;
   1413 
   1414 	case BPF_ST:
   1415 		vstore(s, &val[s->k], val[A_ATOM], alter);
   1416 		break;
   1417 
   1418 	case BPF_STX:
   1419 		vstore(s, &val[s->k], val[X_ATOM], alter);
   1420 		break;
   1421 	}
   1422 }
   1423 
   1424 static void
   1425 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
   1426 {
   1427 	register int atom;
   1428 
   1429 	atom = atomuse(s);
   1430 	if (atom >= 0) {
   1431 		if (atom == AX_ATOM) {
   1432 			last[X_ATOM] = 0;
   1433 			last[A_ATOM] = 0;
   1434 		}
   1435 		else
   1436 			last[atom] = 0;
   1437 	}
   1438 	atom = atomdef(s);
   1439 	if (atom >= 0) {
   1440 		if (last[atom]) {
   1441 			/*
   1442 			 * XXX - optimizer loop detection.
   1443 			 */
   1444 			opt_state->non_branch_movement_performed = 1;
   1445 			opt_state->done = 0;
   1446 			last[atom]->code = NOP;
   1447 		}
   1448 		last[atom] = s;
   1449 	}
   1450 }
   1451 
   1452 static void
   1453 opt_deadstores(opt_state_t *opt_state, register struct block *b)
   1454 {
   1455 	register struct slist *s;
   1456 	register int atom;
   1457 	struct stmt *last[N_ATOMS];
   1458 
   1459 	memset((char *)last, 0, sizeof last);
   1460 
   1461 	for (s = b->stmts; s != 0; s = s->next)
   1462 		deadstmt(opt_state, &s->s, last);
   1463 	deadstmt(opt_state, &b->s, last);
   1464 
   1465 	for (atom = 0; atom < N_ATOMS; ++atom)
   1466 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
   1467 			last[atom]->code = NOP;
   1468 			/*
   1469 			 * The store was removed as it's dead,
   1470 			 * so the value stored into now has
   1471 			 * an unknown value.
   1472 			 */
   1473 			vstore(0, &b->val[atom], VAL_UNKNOWN, 0);
   1474 			/*
   1475 			 * XXX - optimizer loop detection.
   1476 			 */
   1477 			opt_state->non_branch_movement_performed = 1;
   1478 			opt_state->done = 0;
   1479 		}
   1480 }
   1481 
   1482 static void
   1483 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
   1484 {
   1485 	struct slist *s;
   1486 	struct edge *p;
   1487 	int i;
   1488 	bpf_u_int32 aval, xval;
   1489 
   1490 #if 0
   1491 	for (s = b->stmts; s && s->next; s = s->next)
   1492 		if (BPF_CLASS(s->s.code) == BPF_JMP) {
   1493 			do_stmts = 0;
   1494 			break;
   1495 		}
   1496 #endif
   1497 
   1498 	/*
   1499 	 * Initialize the atom values.
   1500 	 */
   1501 	p = b->in_edges;
   1502 	if (p == 0) {
   1503 		/*
   1504 		 * We have no predecessors, so everything is undefined
   1505 		 * upon entry to this block.
   1506 		 */
   1507 		memset((char *)b->val, 0, sizeof(b->val));
   1508 	} else {
   1509 		/*
   1510 		 * Inherit values from our predecessors.
   1511 		 *
   1512 		 * First, get the values from the predecessor along the
   1513 		 * first edge leading to this node.
   1514 		 */
   1515 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
   1516 		/*
   1517 		 * Now look at all the other nodes leading to this node.
   1518 		 * If, for the predecessor along that edge, a register
   1519 		 * has a different value from the one we have (i.e.,
   1520 		 * control paths are merging, and the merging paths
   1521 		 * assign different values to that register), give the
   1522 		 * register the undefined value of 0.
   1523 		 */
   1524 		while ((p = p->next) != NULL) {
   1525 			for (i = 0; i < N_ATOMS; ++i)
   1526 				if (b->val[i] != p->pred->val[i])
   1527 					b->val[i] = 0;
   1528 		}
   1529 	}
   1530 	aval = b->val[A_ATOM];
   1531 	xval = b->val[X_ATOM];
   1532 	for (s = b->stmts; s; s = s->next)
   1533 		opt_stmt(opt_state, &s->s, b->val, do_stmts);
   1534 
   1535 	/*
   1536 	 * This is a special case: if we don't use anything from this
   1537 	 * block, and we load the accumulator or index register with a
   1538 	 * value that is already there, or if this block is a return,
   1539 	 * eliminate all the statements.
   1540 	 *
   1541 	 * XXX - what if it does a store?  Presumably that falls under
   1542 	 * the heading of "if we don't use anything from this block",
   1543 	 * i.e., if we use any memory location set to a different
   1544 	 * value by this block, then we use something from this block.
   1545 	 *
   1546 	 * XXX - why does it matter whether we use anything from this
   1547 	 * block?  If the accumulator or index register doesn't change
   1548 	 * its value, isn't that OK even if we use that value?
   1549 	 *
   1550 	 * XXX - if we load the accumulator with a different value,
   1551 	 * and the block ends with a conditional branch, we obviously
   1552 	 * can't eliminate it, as the branch depends on that value.
   1553 	 * For the index register, the conditional branch only depends
   1554 	 * on the index register value if the test is against the index
   1555 	 * register value rather than a constant; if nothing uses the
   1556 	 * value we put into the index register, and we're not testing
   1557 	 * against the index register's value, and there aren't any
   1558 	 * other problems that would keep us from eliminating this
   1559 	 * block, can we eliminate it?
   1560 	 */
   1561 	if (do_stmts &&
   1562 	    ((b->out_use == 0 &&
   1563 	      aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
   1564 	      xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
   1565 	     BPF_CLASS(b->s.code) == BPF_RET)) {
   1566 		if (b->stmts != 0) {
   1567 			b->stmts = 0;
   1568 			/*
   1569 			 * XXX - optimizer loop detection.
   1570 			 */
   1571 			opt_state->non_branch_movement_performed = 1;
   1572 			opt_state->done = 0;
   1573 		}
   1574 	} else {
   1575 		opt_peep(opt_state, b);
   1576 		opt_deadstores(opt_state, b);
   1577 	}
   1578 	/*
   1579 	 * Set up values for branch optimizer.
   1580 	 */
   1581 	if (BPF_SRC(b->s.code) == BPF_K)
   1582 		b->oval = K(b->s.k);
   1583 	else
   1584 		b->oval = b->val[X_ATOM];
   1585 	b->et.code = b->s.code;
   1586 	b->ef.code = -b->s.code;
   1587 }
   1588 
   1589 /*
   1590  * Return true if any register that is used on exit from 'succ', has
   1591  * an exit value that is different from the corresponding exit value
   1592  * from 'b'.
   1593  */
   1594 static int
   1595 use_conflict(struct block *b, struct block *succ)
   1596 {
   1597 	int atom;
   1598 	atomset use = succ->out_use;
   1599 
   1600 	if (use == 0)
   1601 		return 0;
   1602 
   1603 	for (atom = 0; atom < N_ATOMS; ++atom)
   1604 		if (ATOMELEM(use, atom))
   1605 			if (b->val[atom] != succ->val[atom])
   1606 				return 1;
   1607 	return 0;
   1608 }
   1609 
   1610 /*
   1611  * Given a block that is the successor of an edge, and an edge that
   1612  * dominates that edge, return either a pointer to a child of that
   1613  * block (a block to which that block jumps) if that block is a
   1614  * candidate to replace the successor of the latter edge or NULL
   1615  * if neither of the children of the first block are candidates.
   1616  */
   1617 static struct block *
   1618 fold_edge(struct block *child, struct edge *ep)
   1619 {
   1620 	int sense;
   1621 	bpf_u_int32 aval0, aval1, oval0, oval1;
   1622 	int code = ep->code;
   1623 
   1624 	if (code < 0) {
   1625 		/*
   1626 		 * This edge is a "branch if false" edge.
   1627 		 */
   1628 		code = -code;
   1629 		sense = 0;
   1630 	} else {
   1631 		/*
   1632 		 * This edge is a "branch if true" edge.
   1633 		 */
   1634 		sense = 1;
   1635 	}
   1636 
   1637 	/*
   1638 	 * If the opcode for the branch at the end of the block we
   1639 	 * were handed isn't the same as the opcode for the branch
   1640 	 * to which the edge we were handed corresponds, the tests
   1641 	 * for those branches aren't testing the same conditions,
   1642 	 * so the blocks to which the first block branches aren't
   1643 	 * candidates to replace the successor of the edge.
   1644 	 */
   1645 	if (child->s.code != code)
   1646 		return 0;
   1647 
   1648 	aval0 = child->val[A_ATOM];
   1649 	oval0 = child->oval;
   1650 	aval1 = ep->pred->val[A_ATOM];
   1651 	oval1 = ep->pred->oval;
   1652 
   1653 	/*
   1654 	 * If the A register value on exit from the successor block
   1655 	 * isn't the same as the A register value on exit from the
   1656 	 * predecessor of the edge, the blocks to which the first
   1657 	 * block branches aren't candidates to replace the successor
   1658 	 * of the edge.
   1659 	 */
   1660 	if (aval0 != aval1)
   1661 		return 0;
   1662 
   1663 	if (oval0 == oval1)
   1664 		/*
   1665 		 * The operands of the branch instructions are
   1666 		 * identical, so the branches are testing the
   1667 		 * same condition, and the result is true if a true
   1668 		 * branch was taken to get here, otherwise false.
   1669 		 */
   1670 		return sense ? JT(child) : JF(child);
   1671 
   1672 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
   1673 		/*
   1674 		 * At this point, we only know the comparison if we
   1675 		 * came down the true branch, and it was an equality
   1676 		 * comparison with a constant.
   1677 		 *
   1678 		 * I.e., if we came down the true branch, and the branch
   1679 		 * was an equality comparison with a constant, we know the
   1680 		 * accumulator contains that constant.  If we came down
   1681 		 * the false branch, or the comparison wasn't with a
   1682 		 * constant, we don't know what was in the accumulator.
   1683 		 *
   1684 		 * We rely on the fact that distinct constants have distinct
   1685 		 * value numbers.
   1686 		 */
   1687 		return JF(child);
   1688 
   1689 	return 0;
   1690 }
   1691 
   1692 /*
   1693  * If we can make this edge go directly to a child of the edge's current
   1694  * successor, do so.
   1695  */
   1696 static void
   1697 opt_j(opt_state_t *opt_state, struct edge *ep)
   1698 {
   1699 	register u_int i, k;
   1700 	register struct block *target;
   1701 
   1702 	/*
   1703 	 * Does this edge go to a block where, if the test
   1704 	 * at the end of it succeeds, it goes to a block
   1705 	 * that's a leaf node of the DAG, i.e. a return
   1706 	 * statement?
   1707 	 * If so, there's nothing to optimize.
   1708 	 */
   1709 	if (JT(ep->succ) == 0)
   1710 		return;
   1711 
   1712 	/*
   1713 	 * Does this edge go to a block that goes, in turn, to
   1714 	 * the same block regardless of whether the test at the
   1715 	 * end succeeds or fails?
   1716 	 */
   1717 	if (JT(ep->succ) == JF(ep->succ)) {
   1718 		/*
   1719 		 * Common branch targets can be eliminated, provided
   1720 		 * there is no data dependency.
   1721 		 *
   1722 		 * Check whether any register used on exit from the
   1723 		 * block to which the successor of this edge goes
   1724 		 * has a value at that point that's different from
   1725 		 * the value it has on exit from the predecessor of
   1726 		 * this edge.  If not, the predecessor of this edge
   1727 		 * can just go to the block to which the successor
   1728 		 * of this edge goes, bypassing the successor of this
   1729 		 * edge, as the successor of this edge isn't doing
   1730 		 * any calculations whose results are different
   1731 		 * from what the blocks before it did and isn't
   1732 		 * doing any tests the results of which matter.
   1733 		 */
   1734 		if (!use_conflict(ep->pred, JT(ep->succ))) {
   1735 			/*
   1736 			 * No, there isn't.
   1737 			 * Make this edge go to the block to
   1738 			 * which the successor of that edge
   1739 			 * goes.
   1740 			 *
   1741 			 * XXX - optimizer loop detection.
   1742 			 */
   1743 			opt_state->non_branch_movement_performed = 1;
   1744 			opt_state->done = 0;
   1745 			ep->succ = JT(ep->succ);
   1746 		}
   1747 	}
   1748 	/*
   1749 	 * For each edge dominator that matches the successor of this
   1750 	 * edge, promote the edge successor to the its grandchild.
   1751 	 *
   1752 	 * XXX We violate the set abstraction here in favor a reasonably
   1753 	 * efficient loop.
   1754 	 */
   1755  top:
   1756 	for (i = 0; i < opt_state->edgewords; ++i) {
   1757 		/* i'th word in the bitset of dominators */
   1758 		register bpf_u_int32 x = ep->edom[i];
   1759 
   1760 		while (x != 0) {
   1761 			/* Find the next dominator in that word and mark it as found */
   1762 			k = lowest_set_bit(x);
   1763 			x &=~ ((bpf_u_int32)1 << k);
   1764 			k += i * BITS_PER_WORD;
   1765 
   1766 			target = fold_edge(ep->succ, opt_state->edges[k]);
   1767 			/*
   1768 			 * We have a candidate to replace the successor
   1769 			 * of ep.
   1770 			 *
   1771 			 * Check that there is no data dependency between
   1772 			 * nodes that will be violated if we move the edge;
   1773 			 * i.e., if any register used on exit from the
   1774 			 * candidate has a value at that point different
   1775 			 * from the value it has when we exit the
   1776 			 * predecessor of that edge, there's a data
   1777 			 * dependency that will be violated.
   1778 			 */
   1779 			if (target != 0 && !use_conflict(ep->pred, target)) {
   1780 				/*
   1781 				 * It's safe to replace the successor of
   1782 				 * ep; do so, and note that we've made
   1783 				 * at least one change.
   1784 				 *
   1785 				 * XXX - this is one of the operations that
   1786 				 * happens when the optimizer gets into
   1787 				 * one of those infinite loops.
   1788 				 */
   1789 				opt_state->done = 0;
   1790 				ep->succ = target;
   1791 				if (JT(target) != 0)
   1792 					/*
   1793 					 * Start over unless we hit a leaf.
   1794 					 */
   1795 					goto top;
   1796 				return;
   1797 			}
   1798 		}
   1799 	}
   1800 }
   1801 
   1802 /*
   1803  * XXX - is this, and and_pullup(), what's described in section 6.1.2
   1804  * "Predicate Assertion Propagation" in the BPF+ paper?
   1805  *
   1806  * Note that this looks at block dominators, not edge dominators.
   1807  * Don't think so.
   1808  *
   1809  * "A or B" compiles into
   1810  *
   1811  *          A
   1812  *       t / \ f
   1813  *        /   B
   1814  *       / t / \ f
   1815  *      \   /
   1816  *       \ /
   1817  *        X
   1818  *
   1819  *
   1820  */
   1821 static void
   1822 or_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
   1823 {
   1824 	bpf_u_int32 val;
   1825 	int at_top;
   1826 	struct block *pull;
   1827 	struct block **diffp, **samep;
   1828 	struct edge *ep;
   1829 
   1830 	ep = b->in_edges;
   1831 	if (ep == 0)
   1832 		return;
   1833 
   1834 	/*
   1835 	 * Make sure each predecessor loads the same value.
   1836 	 * XXX why?
   1837 	 */
   1838 	val = ep->pred->val[A_ATOM];
   1839 	for (ep = ep->next; ep != 0; ep = ep->next)
   1840 		if (val != ep->pred->val[A_ATOM])
   1841 			return;
   1842 
   1843 	/*
   1844 	 * For the first edge in the list of edges coming into this block,
   1845 	 * see whether the predecessor of that edge comes here via a true
   1846 	 * branch or a false branch.
   1847 	 */
   1848 	if (JT(b->in_edges->pred) == b)
   1849 		diffp = &JT(b->in_edges->pred);	/* jt */
   1850 	else
   1851 		diffp = &JF(b->in_edges->pred);	/* jf */
   1852 
   1853 	/*
   1854 	 * diffp is a pointer to a pointer to the block.
   1855 	 *
   1856 	 * Go down the false chain looking as far as you can,
   1857 	 * making sure that each jump-compare is doing the
   1858 	 * same as the original block.
   1859 	 *
   1860 	 * If you reach the bottom before you reach a
   1861 	 * different jump-compare, just exit.  There's nothing
   1862 	 * to do here.  XXX - no, this version is checking for
   1863 	 * the value leaving the block; that's from the BPF+
   1864 	 * pullup routine.
   1865 	 */
   1866 	at_top = 1;
   1867 	for (;;) {
   1868 		/*
   1869 		 * Done if that's not going anywhere XXX
   1870 		 */
   1871 		if (*diffp == 0)
   1872 			return;
   1873 
   1874 		/*
   1875 		 * Done if that predecessor blah blah blah isn't
   1876 		 * going the same place we're going XXX
   1877 		 *
   1878 		 * Does the true edge of this block point to the same
   1879 		 * location as the true edge of b?
   1880 		 */
   1881 		if (JT(*diffp) != JT(b))
   1882 			return;
   1883 
   1884 		/*
   1885 		 * Done if this node isn't a dominator of that
   1886 		 * node blah blah blah XXX
   1887 		 *
   1888 		 * Does b dominate diffp?
   1889 		 */
   1890 		if (!SET_MEMBER((*diffp)->dom, b->id))
   1891 			return;
   1892 
   1893 		/*
   1894 		 * Break out of the loop if that node's value of A
   1895 		 * isn't the value of A above XXX
   1896 		 */
   1897 		if ((*diffp)->val[A_ATOM] != val)
   1898 			break;
   1899 
   1900 		/*
   1901 		 * Get the JF for that node XXX
   1902 		 * Go down the false path.
   1903 		 */
   1904 		diffp = &JF(*diffp);
   1905 		at_top = 0;
   1906 	}
   1907 
   1908 	/*
   1909 	 * Now that we've found a different jump-compare in a chain
   1910 	 * below b, search further down until we find another
   1911 	 * jump-compare that looks at the original value.  This
   1912 	 * jump-compare should get pulled up.  XXX again we're
   1913 	 * comparing values not jump-compares.
   1914 	 */
   1915 	samep = &JF(*diffp);
   1916 	for (;;) {
   1917 		/*
   1918 		 * Done if that's not going anywhere XXX
   1919 		 */
   1920 		if (*samep == 0)
   1921 			return;
   1922 
   1923 		/*
   1924 		 * Done if that predecessor blah blah blah isn't
   1925 		 * going the same place we're going XXX
   1926 		 */
   1927 		if (JT(*samep) != JT(b))
   1928 			return;
   1929 
   1930 		/*
   1931 		 * Done if this node isn't a dominator of that
   1932 		 * node blah blah blah XXX
   1933 		 *
   1934 		 * Does b dominate samep?
   1935 		 */
   1936 		if (!SET_MEMBER((*samep)->dom, b->id))
   1937 			return;
   1938 
   1939 		/*
   1940 		 * Break out of the loop if that node's value of A
   1941 		 * is the value of A above XXX
   1942 		 */
   1943 		if ((*samep)->val[A_ATOM] == val)
   1944 			break;
   1945 
   1946 		/* XXX Need to check that there are no data dependencies
   1947 		   between dp0 and dp1.  Currently, the code generator
   1948 		   will not produce such dependencies. */
   1949 		samep = &JF(*samep);
   1950 	}
   1951 #ifdef notdef
   1952 	/* XXX This doesn't cover everything. */
   1953 	for (i = 0; i < N_ATOMS; ++i)
   1954 		if ((*samep)->val[i] != pred->val[i])
   1955 			return;
   1956 #endif
   1957 	/* Pull up the node. */
   1958 	pull = *samep;
   1959 	*samep = JF(pull);
   1960 	JF(pull) = *diffp;
   1961 
   1962 	/*
   1963 	 * At the top of the chain, each predecessor needs to point at the
   1964 	 * pulled up node.  Inside the chain, there is only one predecessor
   1965 	 * to worry about.
   1966 	 */
   1967 	if (at_top) {
   1968 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
   1969 			if (JT(ep->pred) == b)
   1970 				JT(ep->pred) = pull;
   1971 			else
   1972 				JF(ep->pred) = pull;
   1973 		}
   1974 	}
   1975 	else
   1976 		*diffp = pull;
   1977 
   1978 	/*
   1979 	 * XXX - this is one of the operations that happens when the
   1980 	 * optimizer gets into one of those infinite loops.
   1981 	 */
   1982 	opt_state->done = 0;
   1983 
   1984 	/*
   1985 	 * Recompute dominator sets as control flow graph has changed.
   1986 	 */
   1987 	find_dom(opt_state, root);
   1988 }
   1989 
   1990 static void
   1991 and_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
   1992 {
   1993 	bpf_u_int32 val;
   1994 	int at_top;
   1995 	struct block *pull;
   1996 	struct block **diffp, **samep;
   1997 	struct edge *ep;
   1998 
   1999 	ep = b->in_edges;
   2000 	if (ep == 0)
   2001 		return;
   2002 
   2003 	/*
   2004 	 * Make sure each predecessor loads the same value.
   2005 	 */
   2006 	val = ep->pred->val[A_ATOM];
   2007 	for (ep = ep->next; ep != 0; ep = ep->next)
   2008 		if (val != ep->pred->val[A_ATOM])
   2009 			return;
   2010 
   2011 	if (JT(b->in_edges->pred) == b)
   2012 		diffp = &JT(b->in_edges->pred);
   2013 	else
   2014 		diffp = &JF(b->in_edges->pred);
   2015 
   2016 	at_top = 1;
   2017 	for (;;) {
   2018 		if (*diffp == 0)
   2019 			return;
   2020 
   2021 		if (JF(*diffp) != JF(b))
   2022 			return;
   2023 
   2024 		if (!SET_MEMBER((*diffp)->dom, b->id))
   2025 			return;
   2026 
   2027 		if ((*diffp)->val[A_ATOM] != val)
   2028 			break;
   2029 
   2030 		diffp = &JT(*diffp);
   2031 		at_top = 0;
   2032 	}
   2033 	samep = &JT(*diffp);
   2034 	for (;;) {
   2035 		if (*samep == 0)
   2036 			return;
   2037 
   2038 		if (JF(*samep) != JF(b))
   2039 			return;
   2040 
   2041 		if (!SET_MEMBER((*samep)->dom, b->id))
   2042 			return;
   2043 
   2044 		if ((*samep)->val[A_ATOM] == val)
   2045 			break;
   2046 
   2047 		/* XXX Need to check that there are no data dependencies
   2048 		   between diffp and samep.  Currently, the code generator
   2049 		   will not produce such dependencies. */
   2050 		samep = &JT(*samep);
   2051 	}
   2052 #ifdef notdef
   2053 	/* XXX This doesn't cover everything. */
   2054 	for (i = 0; i < N_ATOMS; ++i)
   2055 		if ((*samep)->val[i] != pred->val[i])
   2056 			return;
   2057 #endif
   2058 	/* Pull up the node. */
   2059 	pull = *samep;
   2060 	*samep = JT(pull);
   2061 	JT(pull) = *diffp;
   2062 
   2063 	/*
   2064 	 * At the top of the chain, each predecessor needs to point at the
   2065 	 * pulled up node.  Inside the chain, there is only one predecessor
   2066 	 * to worry about.
   2067 	 */
   2068 	if (at_top) {
   2069 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
   2070 			if (JT(ep->pred) == b)
   2071 				JT(ep->pred) = pull;
   2072 			else
   2073 				JF(ep->pred) = pull;
   2074 		}
   2075 	}
   2076 	else
   2077 		*diffp = pull;
   2078 
   2079 	/*
   2080 	 * XXX - this is one of the operations that happens when the
   2081 	 * optimizer gets into one of those infinite loops.
   2082 	 */
   2083 	opt_state->done = 0;
   2084 
   2085 	/*
   2086 	 * Recompute dominator sets as control flow graph has changed.
   2087 	 */
   2088 	find_dom(opt_state, root);
   2089 }
   2090 
   2091 static void
   2092 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
   2093 {
   2094 	int i, maxlevel;
   2095 	struct block *p;
   2096 
   2097 	init_val(opt_state);
   2098 	maxlevel = ic->root->level;
   2099 
   2100 	find_inedges(opt_state, ic->root);
   2101 	for (i = maxlevel; i >= 0; --i)
   2102 		for (p = opt_state->levels[i]; p; p = p->link)
   2103 			opt_blk(opt_state, p, do_stmts);
   2104 
   2105 	if (do_stmts)
   2106 		/*
   2107 		 * No point trying to move branches; it can't possibly
   2108 		 * make a difference at this point.
   2109 		 *
   2110 		 * XXX - this might be after we detect a loop where
   2111 		 * we were just looping infinitely moving branches
   2112 		 * in such a fashion that we went through two or more
   2113 		 * versions of the machine code, eventually returning
   2114 		 * to the first version.  (We're really not doing a
   2115 		 * full loop detection, we're just testing for two
   2116 		 * passes in a row where we do nothing but
   2117 		 * move branches.)
   2118 		 */
   2119 		return;
   2120 
   2121 	/*
   2122 	 * Is this what the BPF+ paper describes in sections 6.1.1,
   2123 	 * 6.1.2, and 6.1.3?
   2124 	 */
   2125 	for (i = 1; i <= maxlevel; ++i) {
   2126 		for (p = opt_state->levels[i]; p; p = p->link) {
   2127 			opt_j(opt_state, &p->et);
   2128 			opt_j(opt_state, &p->ef);
   2129 		}
   2130 	}
   2131 
   2132 	find_inedges(opt_state, ic->root);
   2133 	for (i = 1; i <= maxlevel; ++i) {
   2134 		for (p = opt_state->levels[i]; p; p = p->link) {
   2135 			or_pullup(opt_state, p, ic->root);
   2136 			and_pullup(opt_state, p, ic->root);
   2137 		}
   2138 	}
   2139 }
   2140 
   2141 static inline void
   2142 link_inedge(struct edge *parent, struct block *child)
   2143 {
   2144 	parent->next = child->in_edges;
   2145 	child->in_edges = parent;
   2146 }
   2147 
   2148 static void
   2149 find_inedges(opt_state_t *opt_state, struct block *root)
   2150 {
   2151 	u_int i;
   2152 	int level;
   2153 	struct block *b;
   2154 
   2155 	for (i = 0; i < opt_state->n_blocks; ++i)
   2156 		opt_state->blocks[i]->in_edges = 0;
   2157 
   2158 	/*
   2159 	 * Traverse the graph, adding each edge to the predecessor
   2160 	 * list of its successors.  Skip the leaves (i.e. level 0).
   2161 	 */
   2162 	for (level = root->level; level > 0; --level) {
   2163 		for (b = opt_state->levels[level]; b != 0; b = b->link) {
   2164 			link_inedge(&b->et, JT(b));
   2165 			link_inedge(&b->ef, JF(b));
   2166 		}
   2167 	}
   2168 }
   2169 
   2170 static void
   2171 opt_root(struct block **b)
   2172 {
   2173 	struct slist *tmp, *s;
   2174 
   2175 	s = (*b)->stmts;
   2176 	(*b)->stmts = 0;
   2177 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
   2178 		*b = JT(*b);
   2179 
   2180 	tmp = (*b)->stmts;
   2181 	if (tmp != 0)
   2182 		sappend(s, tmp);
   2183 	(*b)->stmts = s;
   2184 
   2185 	/*
   2186 	 * If the root node is a return, then there is no
   2187 	 * point executing any statements (since the bpf machine
   2188 	 * has no side effects).
   2189 	 */
   2190 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
   2191 		(*b)->stmts = 0;
   2192 }
   2193 
   2194 static void
   2195 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
   2196 {
   2197 
   2198 #ifdef BDEBUG
   2199 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   2200 		printf("opt_loop(root, %d) begin\n", do_stmts);
   2201 		opt_dump(opt_state, ic);
   2202 	}
   2203 #endif
   2204 
   2205 	/*
   2206 	 * XXX - optimizer loop detection.
   2207 	 */
   2208 	int loop_count = 0;
   2209 	for (;;) {
   2210 		opt_state->done = 1;
   2211 		/*
   2212 		 * XXX - optimizer loop detection.
   2213 		 */
   2214 		opt_state->non_branch_movement_performed = 0;
   2215 		find_levels(opt_state, ic);
   2216 		find_dom(opt_state, ic->root);
   2217 		find_closure(opt_state, ic->root);
   2218 		find_ud(opt_state, ic->root);
   2219 		find_edom(opt_state, ic->root);
   2220 		opt_blks(opt_state, ic, do_stmts);
   2221 #ifdef BDEBUG
   2222 		if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   2223 			printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
   2224 			opt_dump(opt_state, ic);
   2225 		}
   2226 #endif
   2227 
   2228 		/*
   2229 		 * Was anything done in this optimizer pass?
   2230 		 */
   2231 		if (opt_state->done) {
   2232 			/*
   2233 			 * No, so we've reached a fixed point.
   2234 			 * We're done.
   2235 			 */
   2236 			break;
   2237 		}
   2238 
   2239 		/*
   2240 		 * XXX - was anything done other than branch movement
   2241 		 * in this pass?
   2242 		 */
   2243 		if (opt_state->non_branch_movement_performed) {
   2244 			/*
   2245 			 * Yes.  Clear any loop-detection counter;
   2246 			 * we're making some form of progress (assuming
   2247 			 * we can't get into a cycle doing *other*
   2248 			 * optimizations...).
   2249 			 */
   2250 			loop_count = 0;
   2251 		} else {
   2252 			/*
   2253 			 * No - increment the counter, and quit if
   2254 			 * it's up to 100.
   2255 			 */
   2256 			loop_count++;
   2257 			if (loop_count >= 100) {
   2258 				/*
   2259 				 * We've done nothing but branch movement
   2260 				 * for 100 passes; we're probably
   2261 				 * in a cycle and will never reach a
   2262 				 * fixed point.
   2263 				 *
   2264 				 * XXX - yes, we really need a non-
   2265 				 * heuristic way of detecting a cycle.
   2266 				 */
   2267 				opt_state->done = 1;
   2268 				break;
   2269 			}
   2270 		}
   2271 	}
   2272 }
   2273 
   2274 /*
   2275  * Optimize the filter code in its dag representation.
   2276  * Return 0 on success, -1 on error.
   2277  */
   2278 int
   2279 bpf_optimize(struct icode *ic, char *errbuf)
   2280 {
   2281 	opt_state_t opt_state;
   2282 
   2283 	memset(&opt_state, 0, sizeof(opt_state));
   2284 	opt_state.errbuf = errbuf;
   2285 	opt_state.non_branch_movement_performed = 0;
   2286 	if (setjmp(opt_state.top_ctx)) {
   2287 		opt_cleanup(&opt_state);
   2288 		return -1;
   2289 	}
   2290 	opt_init(&opt_state, ic);
   2291 	opt_loop(&opt_state, ic, 0);
   2292 	opt_loop(&opt_state, ic, 1);
   2293 	intern_blocks(&opt_state, ic);
   2294 #ifdef BDEBUG
   2295 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   2296 		printf("after intern_blocks()\n");
   2297 		opt_dump(&opt_state, ic);
   2298 	}
   2299 #endif
   2300 	opt_root(&ic->root);
   2301 #ifdef BDEBUG
   2302 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
   2303 		printf("after opt_root()\n");
   2304 		opt_dump(&opt_state, ic);
   2305 	}
   2306 #endif
   2307 	opt_cleanup(&opt_state);
   2308 	return 0;
   2309 }
   2310 
   2311 static void
   2312 make_marks(struct icode *ic, struct block *p)
   2313 {
   2314 	if (!isMarked(ic, p)) {
   2315 		Mark(ic, p);
   2316 		if (BPF_CLASS(p->s.code) != BPF_RET) {
   2317 			make_marks(ic, JT(p));
   2318 			make_marks(ic, JF(p));
   2319 		}
   2320 	}
   2321 }
   2322 
   2323 /*
   2324  * Mark code array such that isMarked(ic->cur_mark, i) is true
   2325  * only for nodes that are alive.
   2326  */
   2327 static void
   2328 mark_code(struct icode *ic)
   2329 {
   2330 	ic->cur_mark += 1;
   2331 	make_marks(ic, ic->root);
   2332 }
   2333 
   2334 /*
   2335  * True iff the two stmt lists load the same value from the packet into
   2336  * the accumulator.
   2337  */
   2338 static int
   2339 eq_slist(struct slist *x, struct slist *y)
   2340 {
   2341 	for (;;) {
   2342 		while (x && x->s.code == NOP)
   2343 			x = x->next;
   2344 		while (y && y->s.code == NOP)
   2345 			y = y->next;
   2346 		if (x == 0)
   2347 			return y == 0;
   2348 		if (y == 0)
   2349 			return x == 0;
   2350 		if (x->s.code != y->s.code || x->s.k != y->s.k)
   2351 			return 0;
   2352 		x = x->next;
   2353 		y = y->next;
   2354 	}
   2355 }
   2356 
   2357 static inline int
   2358 eq_blk(struct block *b0, struct block *b1)
   2359 {
   2360 	if (b0->s.code == b1->s.code &&
   2361 	    b0->s.k == b1->s.k &&
   2362 	    b0->et.succ == b1->et.succ &&
   2363 	    b0->ef.succ == b1->ef.succ)
   2364 		return eq_slist(b0->stmts, b1->stmts);
   2365 	return 0;
   2366 }
   2367 
   2368 static void
   2369 intern_blocks(opt_state_t *opt_state, struct icode *ic)
   2370 {
   2371 	struct block *p;
   2372 	u_int i, j;
   2373 	int done1; /* don't shadow global */
   2374  top:
   2375 	done1 = 1;
   2376 	for (i = 0; i < opt_state->n_blocks; ++i)
   2377 		opt_state->blocks[i]->link = 0;
   2378 
   2379 	mark_code(ic);
   2380 
   2381 	for (i = opt_state->n_blocks - 1; i != 0; ) {
   2382 		--i;
   2383 		if (!isMarked(ic, opt_state->blocks[i]))
   2384 			continue;
   2385 		for (j = i + 1; j < opt_state->n_blocks; ++j) {
   2386 			if (!isMarked(ic, opt_state->blocks[j]))
   2387 				continue;
   2388 			if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
   2389 				opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
   2390 					opt_state->blocks[j]->link : opt_state->blocks[j];
   2391 				break;
   2392 			}
   2393 		}
   2394 	}
   2395 	for (i = 0; i < opt_state->n_blocks; ++i) {
   2396 		p = opt_state->blocks[i];
   2397 		if (JT(p) == 0)
   2398 			continue;
   2399 		if (JT(p)->link) {
   2400 			done1 = 0;
   2401 			JT(p) = JT(p)->link;
   2402 		}
   2403 		if (JF(p)->link) {
   2404 			done1 = 0;
   2405 			JF(p) = JF(p)->link;
   2406 		}
   2407 	}
   2408 	if (!done1)
   2409 		goto top;
   2410 }
   2411 
   2412 static void
   2413 opt_cleanup(opt_state_t *opt_state)
   2414 {
   2415 	free((void *)opt_state->vnode_base);
   2416 	free((void *)opt_state->vmap);
   2417 	free((void *)opt_state->edges);
   2418 	free((void *)opt_state->space);
   2419 	free((void *)opt_state->levels);
   2420 	free((void *)opt_state->blocks);
   2421 }
   2422 
   2423 /*
   2424  * For optimizer errors.
   2425  */
   2426 static void PCAP_NORETURN
   2427 opt_error(opt_state_t *opt_state, const char *fmt, ...)
   2428 {
   2429 	va_list ap;
   2430 
   2431 	if (opt_state->errbuf != NULL) {
   2432 		va_start(ap, fmt);
   2433 		(void)vsnprintf(opt_state->errbuf,
   2434 		    PCAP_ERRBUF_SIZE, fmt, ap);
   2435 		va_end(ap);
   2436 	}
   2437 	longjmp(opt_state->top_ctx, 1);
   2438 	/* NOTREACHED */
   2439 #ifdef _AIX
   2440 	PCAP_UNREACHABLE
   2441 #endif /* _AIX */
   2442 }
   2443 
   2444 /*
   2445  * Return the number of stmts in 's'.
   2446  */
   2447 static u_int
   2448 slength(struct slist *s)
   2449 {
   2450 	u_int n = 0;
   2451 
   2452 	for (; s; s = s->next)
   2453 		if (s->s.code != NOP)
   2454 			++n;
   2455 	return n;
   2456 }
   2457 
   2458 /*
   2459  * Return the number of nodes reachable by 'p'.
   2460  * All nodes should be initially unmarked.
   2461  */
   2462 static int
   2463 count_blocks(struct icode *ic, struct block *p)
   2464 {
   2465 	if (p == 0 || isMarked(ic, p))
   2466 		return 0;
   2467 	Mark(ic, p);
   2468 	return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
   2469 }
   2470 
   2471 /*
   2472  * Do a depth first search on the flow graph, numbering the
   2473  * the basic blocks, and entering them into the 'blocks' array.`
   2474  */
   2475 static void
   2476 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
   2477 {
   2478 	u_int n;
   2479 
   2480 	if (p == 0 || isMarked(ic, p))
   2481 		return;
   2482 
   2483 	Mark(ic, p);
   2484 	n = opt_state->n_blocks++;
   2485 	if (opt_state->n_blocks == 0) {
   2486 		/*
   2487 		 * Overflow.
   2488 		 */
   2489 		opt_error(opt_state, "filter is too complex to optimize");
   2490 	}
   2491 	p->id = n;
   2492 	opt_state->blocks[n] = p;
   2493 
   2494 	number_blks_r(opt_state, ic, JT(p));
   2495 	number_blks_r(opt_state, ic, JF(p));
   2496 }
   2497 
   2498 /*
   2499  * Return the number of stmts in the flowgraph reachable by 'p'.
   2500  * The nodes should be unmarked before calling.
   2501  *
   2502  * Note that "stmts" means "instructions", and that this includes
   2503  *
   2504  *	side-effect statements in 'p' (slength(p->stmts));
   2505  *
   2506  *	statements in the true branch from 'p' (count_stmts(JT(p)));
   2507  *
   2508  *	statements in the false branch from 'p' (count_stmts(JF(p)));
   2509  *
   2510  *	the conditional jump itself (1);
   2511  *
   2512  *	an extra long jump if the true branch requires it (p->longjt);
   2513  *
   2514  *	an extra long jump if the false branch requires it (p->longjf).
   2515  */
   2516 static u_int
   2517 count_stmts(struct icode *ic, struct block *p)
   2518 {
   2519 	u_int n;
   2520 
   2521 	if (p == 0 || isMarked(ic, p))
   2522 		return 0;
   2523 	Mark(ic, p);
   2524 	n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
   2525 	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
   2526 }
   2527 
   2528 /*
   2529  * Allocate memory.  All allocation is done before optimization
   2530  * is begun.  A linear bound on the size of all data structures is computed
   2531  * from the total number of blocks and/or statements.
   2532  */
   2533 static void
   2534 opt_init(opt_state_t *opt_state, struct icode *ic)
   2535 {
   2536 	bpf_u_int32 *p;
   2537 	int i, n, max_stmts;
   2538 	u_int product;
   2539 	size_t block_memsize, edge_memsize;
   2540 
   2541 	/*
   2542 	 * First, count the blocks, so we can malloc an array to map
   2543 	 * block number to block.  Then, put the blocks into the array.
   2544 	 */
   2545 	unMarkAll(ic);
   2546 	n = count_blocks(ic, ic->root);
   2547 	opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
   2548 	if (opt_state->blocks == NULL)
   2549 		opt_error(opt_state, "malloc");
   2550 	unMarkAll(ic);
   2551 	opt_state->n_blocks = 0;
   2552 	number_blks_r(opt_state, ic, ic->root);
   2553 
   2554 	/*
   2555 	 * This "should not happen".
   2556 	 */
   2557 	if (opt_state->n_blocks == 0)
   2558 		opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue");
   2559 
   2560 	opt_state->n_edges = 2 * opt_state->n_blocks;
   2561 	if ((opt_state->n_edges / 2) != opt_state->n_blocks) {
   2562 		/*
   2563 		 * Overflow.
   2564 		 */
   2565 		opt_error(opt_state, "filter is too complex to optimize");
   2566 	}
   2567 	opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
   2568 	if (opt_state->edges == NULL) {
   2569 		opt_error(opt_state, "malloc");
   2570 	}
   2571 
   2572 	/*
   2573 	 * The number of levels is bounded by the number of nodes.
   2574 	 */
   2575 	opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
   2576 	if (opt_state->levels == NULL) {
   2577 		opt_error(opt_state, "malloc");
   2578 	}
   2579 
   2580 	opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1;
   2581 	opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1;
   2582 
   2583 	/*
   2584 	 * Make sure opt_state->n_blocks * opt_state->nodewords fits
   2585 	 * in a u_int; we use it as a u_int number-of-iterations
   2586 	 * value.
   2587 	 */
   2588 	product = opt_state->n_blocks * opt_state->nodewords;
   2589 	if ((product / opt_state->n_blocks) != opt_state->nodewords) {
   2590 		/*
   2591 		 * XXX - just punt and don't try to optimize?
   2592 		 * In practice, this is unlikely to happen with
   2593 		 * a normal filter.
   2594 		 */
   2595 		opt_error(opt_state, "filter is too complex to optimize");
   2596 	}
   2597 
   2598 	/*
   2599 	 * Make sure the total memory required for that doesn't
   2600 	 * overflow.
   2601 	 */
   2602 	block_memsize = (size_t)2 * product * sizeof(*opt_state->space);
   2603 	if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) {
   2604 		opt_error(opt_state, "filter is too complex to optimize");
   2605 	}
   2606 
   2607 	/*
   2608 	 * Make sure opt_state->n_edges * opt_state->edgewords fits
   2609 	 * in a u_int; we use it as a u_int number-of-iterations
   2610 	 * value.
   2611 	 */
   2612 	product = opt_state->n_edges * opt_state->edgewords;
   2613 	if ((product / opt_state->n_edges) != opt_state->edgewords) {
   2614 		opt_error(opt_state, "filter is too complex to optimize");
   2615 	}
   2616 
   2617 	/*
   2618 	 * Make sure the total memory required for that doesn't
   2619 	 * overflow.
   2620 	 */
   2621 	edge_memsize = (size_t)product * sizeof(*opt_state->space);
   2622 	if (edge_memsize / product != sizeof(*opt_state->space)) {
   2623 		opt_error(opt_state, "filter is too complex to optimize");
   2624 	}
   2625 
   2626 	/*
   2627 	 * Make sure the total memory required for both of them doesn't
   2628 	 * overflow.
   2629 	 */
   2630 	if (block_memsize > SIZE_MAX - edge_memsize) {
   2631 		opt_error(opt_state, "filter is too complex to optimize");
   2632 	}
   2633 
   2634 	/* XXX */
   2635 	opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize);
   2636 	if (opt_state->space == NULL) {
   2637 		opt_error(opt_state, "malloc");
   2638 	}
   2639 	p = opt_state->space;
   2640 	opt_state->all_dom_sets = p;
   2641 	for (i = 0; i < n; ++i) {
   2642 		opt_state->blocks[i]->dom = p;
   2643 		p += opt_state->nodewords;
   2644 	}
   2645 	opt_state->all_closure_sets = p;
   2646 	for (i = 0; i < n; ++i) {
   2647 		opt_state->blocks[i]->closure = p;
   2648 		p += opt_state->nodewords;
   2649 	}
   2650 	opt_state->all_edge_sets = p;
   2651 	for (i = 0; i < n; ++i) {
   2652 		register struct block *b = opt_state->blocks[i];
   2653 
   2654 		b->et.edom = p;
   2655 		p += opt_state->edgewords;
   2656 		b->ef.edom = p;
   2657 		p += opt_state->edgewords;
   2658 		b->et.id = i;
   2659 		opt_state->edges[i] = &b->et;
   2660 		b->ef.id = opt_state->n_blocks + i;
   2661 		opt_state->edges[opt_state->n_blocks + i] = &b->ef;
   2662 		b->et.pred = b;
   2663 		b->ef.pred = b;
   2664 	}
   2665 	max_stmts = 0;
   2666 	for (i = 0; i < n; ++i)
   2667 		max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
   2668 	/*
   2669 	 * We allocate at most 3 value numbers per statement,
   2670 	 * so this is an upper bound on the number of valnodes
   2671 	 * we'll need.
   2672 	 */
   2673 	opt_state->maxval = 3 * max_stmts;
   2674 	opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
   2675 	if (opt_state->vmap == NULL) {
   2676 		opt_error(opt_state, "malloc");
   2677 	}
   2678 	opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
   2679 	if (opt_state->vnode_base == NULL) {
   2680 		opt_error(opt_state, "malloc");
   2681 	}
   2682 }
   2683 
   2684 /*
   2685  * This is only used when supporting optimizer debugging.  It is
   2686  * global state, so do *not* do more than one compile in parallel
   2687  * and expect it to provide meaningful information.
   2688  */
   2689 #ifdef BDEBUG
   2690 int bids[NBIDS];
   2691 #endif
   2692 
   2693 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
   2694     PCAP_PRINTFLIKE(2, 3);
   2695 
   2696 /*
   2697  * Returns true if successful.  Returns false if a branch has
   2698  * an offset that is too large.  If so, we have marked that
   2699  * branch so that on a subsequent iteration, it will be treated
   2700  * properly.
   2701  */
   2702 static int
   2703 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
   2704 {
   2705 	struct bpf_insn *dst;
   2706 	struct slist *src;
   2707 	u_int slen;
   2708 	u_int off;
   2709 	struct slist **offset = NULL;
   2710 
   2711 	if (p == 0 || isMarked(ic, p))
   2712 		return (1);
   2713 	Mark(ic, p);
   2714 
   2715 	if (convert_code_r(conv_state, ic, JF(p)) == 0)
   2716 		return (0);
   2717 	if (convert_code_r(conv_state, ic, JT(p)) == 0)
   2718 		return (0);
   2719 
   2720 	slen = slength(p->stmts);
   2721 	dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
   2722 		/* inflate length by any extra jumps */
   2723 
   2724 	p->offset = (int)(dst - conv_state->fstart);
   2725 
   2726 	/* generate offset[] for convenience  */
   2727 	if (slen) {
   2728 		offset = (struct slist **)calloc(slen, sizeof(struct slist *));
   2729 		if (!offset) {
   2730 			conv_error(conv_state, "not enough core");
   2731 			/*NOTREACHED*/
   2732 		}
   2733 	}
   2734 	src = p->stmts;
   2735 	for (off = 0; off < slen && src; off++) {
   2736 #if 0
   2737 		printf("off=%d src=%x\n", off, src);
   2738 #endif
   2739 		offset[off] = src;
   2740 		src = src->next;
   2741 	}
   2742 
   2743 	off = 0;
   2744 	for (src = p->stmts; src; src = src->next) {
   2745 		if (src->s.code == NOP)
   2746 			continue;
   2747 		dst->code = (u_short)src->s.code;
   2748 		dst->k = src->s.k;
   2749 
   2750 		/* fill block-local relative jump */
   2751 		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
   2752 #if 0
   2753 			if (src->s.jt || src->s.jf) {
   2754 				free(offset);
   2755 				conv_error(conv_state, "illegal jmp destination");
   2756 				/*NOTREACHED*/
   2757 			}
   2758 #endif
   2759 			goto filled;
   2760 		}
   2761 		if (off == slen - 2)	/*???*/
   2762 			goto filled;
   2763 
   2764 	    {
   2765 		u_int i;
   2766 		int jt, jf;
   2767 		const char ljerr[] = "%s for block-local relative jump: off=%d";
   2768 
   2769 #if 0
   2770 		printf("code=%x off=%d %x %x\n", src->s.code,
   2771 			off, src->s.jt, src->s.jf);
   2772 #endif
   2773 
   2774 		if (!src->s.jt || !src->s.jf) {
   2775 			free(offset);
   2776 			conv_error(conv_state, ljerr, "no jmp destination", off);
   2777 			/*NOTREACHED*/
   2778 		}
   2779 
   2780 		jt = jf = 0;
   2781 		for (i = 0; i < slen; i++) {
   2782 			if (offset[i] == src->s.jt) {
   2783 				if (jt) {
   2784 					free(offset);
   2785 					conv_error(conv_state, ljerr, "multiple matches", off);
   2786 					/*NOTREACHED*/
   2787 				}
   2788 
   2789 				if (i - off - 1 >= 256) {
   2790 					free(offset);
   2791 					conv_error(conv_state, ljerr, "out-of-range jump", off);
   2792 					/*NOTREACHED*/
   2793 				}
   2794 				dst->jt = (u_char)(i - off - 1);
   2795 				jt++;
   2796 			}
   2797 			if (offset[i] == src->s.jf) {
   2798 				if (jf) {
   2799 					free(offset);
   2800 					conv_error(conv_state, ljerr, "multiple matches", off);
   2801 					/*NOTREACHED*/
   2802 				}
   2803 				if (i - off - 1 >= 256) {
   2804 					free(offset);
   2805 					conv_error(conv_state, ljerr, "out-of-range jump", off);
   2806 					/*NOTREACHED*/
   2807 				}
   2808 				dst->jf = (u_char)(i - off - 1);
   2809 				jf++;
   2810 			}
   2811 		}
   2812 		if (!jt || !jf) {
   2813 			free(offset);
   2814 			conv_error(conv_state, ljerr, "no destination found", off);
   2815 			/*NOTREACHED*/
   2816 		}
   2817 	    }
   2818 filled:
   2819 		++dst;
   2820 		++off;
   2821 	}
   2822 	if (offset)
   2823 		free(offset);
   2824 
   2825 #ifdef BDEBUG
   2826 	if (dst - conv_state->fstart < NBIDS)
   2827 		bids[dst - conv_state->fstart] = p->id + 1;
   2828 #endif
   2829 	dst->code = (u_short)p->s.code;
   2830 	dst->k = p->s.k;
   2831 	if (JT(p)) {
   2832 		/* number of extra jumps inserted */
   2833 		u_char extrajmps = 0;
   2834 		off = JT(p)->offset - (p->offset + slen) - 1;
   2835 		if (off >= 256) {
   2836 		    /* offset too large for branch, must add a jump */
   2837 		    if (p->longjt == 0) {
   2838 			/* mark this instruction and retry */
   2839 			p->longjt++;
   2840 			return(0);
   2841 		    }
   2842 		    dst->jt = extrajmps;
   2843 		    extrajmps++;
   2844 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
   2845 		    dst[extrajmps].k = off - extrajmps;
   2846 		}
   2847 		else
   2848 		    dst->jt = (u_char)off;
   2849 		off = JF(p)->offset - (p->offset + slen) - 1;
   2850 		if (off >= 256) {
   2851 		    /* offset too large for branch, must add a jump */
   2852 		    if (p->longjf == 0) {
   2853 			/* mark this instruction and retry */
   2854 			p->longjf++;
   2855 			return(0);
   2856 		    }
   2857 		    /* branch if F to following jump */
   2858 		    /* if two jumps are inserted, F goes to second one */
   2859 		    dst->jf = extrajmps;
   2860 		    extrajmps++;
   2861 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
   2862 		    dst[extrajmps].k = off - extrajmps;
   2863 		}
   2864 		else
   2865 		    dst->jf = (u_char)off;
   2866 	}
   2867 	return (1);
   2868 }
   2869 
   2870 
   2871 /*
   2872  * Convert flowgraph intermediate representation to the
   2873  * BPF array representation.  Set *lenp to the number of instructions.
   2874  *
   2875  * This routine does *NOT* leak the memory pointed to by fp.  It *must
   2876  * not* do free(fp) before returning fp; doing so would make no sense,
   2877  * as the BPF array pointed to by the return value of icode_to_fcode()
   2878  * must be valid - it's being returned for use in a bpf_program structure.
   2879  *
   2880  * If it appears that icode_to_fcode() is leaking, the problem is that
   2881  * the program using pcap_compile() is failing to free the memory in
   2882  * the BPF program when it's done - the leak is in the program, not in
   2883  * the routine that happens to be allocating the memory.  (By analogy, if
   2884  * a program calls fopen() without ever calling fclose() on the FILE *,
   2885  * it will leak the FILE structure; the leak is not in fopen(), it's in
   2886  * the program.)  Change the program to use pcap_freecode() when it's
   2887  * done with the filter program.  See the pcap man page.
   2888  */
   2889 struct bpf_insn *
   2890 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
   2891     char *errbuf)
   2892 {
   2893 	u_int n;
   2894 	struct bpf_insn *fp;
   2895 	conv_state_t conv_state;
   2896 
   2897 	conv_state.fstart = NULL;
   2898 	conv_state.errbuf = errbuf;
   2899 	if (setjmp(conv_state.top_ctx) != 0) {
   2900 		free(conv_state.fstart);
   2901 		return NULL;
   2902 	}
   2903 
   2904 	/*
   2905 	 * Loop doing convert_code_r() until no branches remain
   2906 	 * with too-large offsets.
   2907 	 */
   2908 	for (;;) {
   2909 	    unMarkAll(ic);
   2910 	    n = *lenp = count_stmts(ic, root);
   2911 
   2912 	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
   2913 	    if (fp == NULL) {
   2914 		(void)snprintf(errbuf, PCAP_ERRBUF_SIZE,
   2915 		    "malloc");
   2916 		return NULL;
   2917 	    }
   2918 	    memset((char *)fp, 0, sizeof(*fp) * n);
   2919 	    conv_state.fstart = fp;
   2920 	    conv_state.ftail = fp + n;
   2921 
   2922 	    unMarkAll(ic);
   2923 	    if (convert_code_r(&conv_state, ic, root))
   2924 		break;
   2925 	    free(fp);
   2926 	}
   2927 
   2928 	return fp;
   2929 }
   2930 
   2931 /*
   2932  * For iconv_to_fconv() errors.
   2933  */
   2934 static void PCAP_NORETURN
   2935 conv_error(conv_state_t *conv_state, const char *fmt, ...)
   2936 {
   2937 	va_list ap;
   2938 
   2939 	va_start(ap, fmt);
   2940 	(void)vsnprintf(conv_state->errbuf,
   2941 	    PCAP_ERRBUF_SIZE, fmt, ap);
   2942 	va_end(ap);
   2943 	longjmp(conv_state->top_ctx, 1);
   2944 	/* NOTREACHED */
   2945 #ifdef _AIX
   2946 	PCAP_UNREACHABLE
   2947 #endif /* _AIX */
   2948 }
   2949 
   2950 /*
   2951  * Make a copy of a BPF program and put it in the "fcode" member of
   2952  * a "pcap_t".
   2953  *
   2954  * If we fail to allocate memory for the copy, fill in the "errbuf"
   2955  * member of the "pcap_t" with an error message, and return -1;
   2956  * otherwise, return 0.
   2957  */
   2958 int
   2959 pcapint_install_bpf_program(pcap_t *p, struct bpf_program *fp)
   2960 {
   2961 	size_t prog_size;
   2962 
   2963 	/*
   2964 	 * Validate the program.
   2965 	 */
   2966 	if (!pcapint_validate_filter(fp->bf_insns, fp->bf_len)) {
   2967 		snprintf(p->errbuf, sizeof(p->errbuf),
   2968 			"BPF program is not valid");
   2969 		return (-1);
   2970 	}
   2971 
   2972 	/*
   2973 	 * Free up any already installed program.
   2974 	 */
   2975 	pcap_freecode(&p->fcode);
   2976 
   2977 	prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
   2978 	p->fcode.bf_len = fp->bf_len;
   2979 	p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
   2980 	if (p->fcode.bf_insns == NULL) {
   2981 		pcapint_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
   2982 		    errno, "malloc");
   2983 		return (-1);
   2984 	}
   2985 	memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
   2986 	return (0);
   2987 }
   2988 
   2989 #ifdef BDEBUG
   2990 static void
   2991 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
   2992     FILE *out)
   2993 {
   2994 	int icount, noffset;
   2995 	int i;
   2996 
   2997 	if (block == NULL || isMarked(ic, block))
   2998 		return;
   2999 	Mark(ic, block);
   3000 
   3001 	icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
   3002 	noffset = min(block->offset + icount, (int)prog->bf_len);
   3003 
   3004 	fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
   3005 	for (i = block->offset; i < noffset; i++) {
   3006 		fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
   3007 	}
   3008 	fprintf(out, "\" tooltip=\"");
   3009 	for (i = 0; i < BPF_MEMWORDS; i++)
   3010 		if (block->val[i] != VAL_UNKNOWN)
   3011 			fprintf(out, "val[%d]=%d ", i, block->val[i]);
   3012 	fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
   3013 	fprintf(out, "val[X]=%d", block->val[X_ATOM]);
   3014 	fprintf(out, "\"");
   3015 	if (JT(block) == NULL)
   3016 		fprintf(out, ", peripheries=2");
   3017 	fprintf(out, "];\n");
   3018 
   3019 	dot_dump_node(ic, JT(block), prog, out);
   3020 	dot_dump_node(ic, JF(block), prog, out);
   3021 }
   3022 
   3023 static void
   3024 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
   3025 {
   3026 	if (block == NULL || isMarked(ic, block))
   3027 		return;
   3028 	Mark(ic, block);
   3029 
   3030 	if (JT(block)) {
   3031 		fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
   3032 				block->id, JT(block)->id);
   3033 		fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
   3034 			   block->id, JF(block)->id);
   3035 	}
   3036 	dot_dump_edge(ic, JT(block), out);
   3037 	dot_dump_edge(ic, JF(block), out);
   3038 }
   3039 
   3040 /* Output the block CFG using graphviz/DOT language
   3041  * In the CFG, block's code, value index for each registers at EXIT,
   3042  * and the jump relationship is show.
   3043  *
   3044  * example DOT for BPF `ip src host 1.1.1.1' is:
   3045     digraph BPF {
   3046 	block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2	jf 5" tooltip="val[A]=0 val[X]=0"];
   3047 	block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4	jf 5" tooltip="val[A]=0 val[X]=0"];
   3048 	block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
   3049 	block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
   3050 	"block0":se -> "block1":n [label="T"];
   3051 	"block0":sw -> "block3":n [label="F"];
   3052 	"block1":se -> "block2":n [label="T"];
   3053 	"block1":sw -> "block3":n [label="F"];
   3054     }
   3055  *
   3056  *  After install graphviz on https://www.graphviz.org/, save it as bpf.dot
   3057  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
   3058  */
   3059 static int
   3060 dot_dump(struct icode *ic, char *errbuf)
   3061 {
   3062 	struct bpf_program f;
   3063 	FILE *out = stdout;
   3064 
   3065 	memset(bids, 0, sizeof bids);
   3066 	f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
   3067 	if (f.bf_insns == NULL)
   3068 		return -1;
   3069 
   3070 	fprintf(out, "digraph BPF {\n");
   3071 	unMarkAll(ic);
   3072 	dot_dump_node(ic, ic->root, &f, out);
   3073 	unMarkAll(ic);
   3074 	dot_dump_edge(ic, ic->root, out);
   3075 	fprintf(out, "}\n");
   3076 
   3077 	free((char *)f.bf_insns);
   3078 	return 0;
   3079 }
   3080 
   3081 static int
   3082 plain_dump(struct icode *ic, char *errbuf)
   3083 {
   3084 	struct bpf_program f;
   3085 
   3086 	memset(bids, 0, sizeof bids);
   3087 	f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
   3088 	if (f.bf_insns == NULL)
   3089 		return -1;
   3090 	bpf_dump(&f, 1);
   3091 	putchar('\n');
   3092 	free((char *)f.bf_insns);
   3093 	return 0;
   3094 }
   3095 
   3096 static void
   3097 opt_dump(opt_state_t *opt_state, struct icode *ic)
   3098 {
   3099 	int status;
   3100 	char errbuf[PCAP_ERRBUF_SIZE];
   3101 
   3102 	/*
   3103 	 * If the CFG, in DOT format, is requested, output it rather than
   3104 	 * the code that would be generated from that graph.
   3105 	 */
   3106 	if (pcap_print_dot_graph)
   3107 		status = dot_dump(ic, errbuf);
   3108 	else
   3109 		status = plain_dump(ic, errbuf);
   3110 	if (status == -1)
   3111 		opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
   3112 }
   3113 #endif
   3114