unbound_munin_ revision 1.1.1.4 1 #!/bin/sh
2 #
3 # plugin for munin to monitor usage of unbound servers.
4 # To install copy this to /usr/local/share/munin/plugins/unbound_munin_
5 # and use munin-node-configure (--suggest, --shell).
6 #
7 # (C) 2008 W.C.A. Wijngaards. BSD Licensed.
8 #
# To set up unbound: enable statistics and unbound-control in unbound.conf
10 # server: extended-statistics: yes
11 # statistics-cumulative: no
12 # statistics-interval: 0
13 # remote-control: control-enable: yes
14 # Run the command unbound-control-setup to generate the key files.
15 #
16 # Environment variables for this script
17 # unbound_conf - where the unbound.conf file is located.
18 # unbound_control - where to find unbound-control executable.
19 # spoof_warn - what level to warn about spoofing
20 # spoof_crit - what level to crit about spoofing
21 #
22 # You can set them in your munin/plugin-conf.d/plugins.conf file
23 # with:
24 # [unbound*]
25 # user root
26 # env.unbound_conf /usr/local/etc/unbound/unbound.conf
27 # env.unbound_control /usr/local/sbin/unbound-control
28 # env.spoof_warn 1000
29 # env.spoof_crit 100000
30 #
31 # This plugin can create different graphs depending on what name
32 # you link it as (with ln -s) into the plugins directory
33 # You can link it multiple times.
34 # If you are only a casual user, the _hits and _by_type are most interesting,
35 # possibly followed by _by_rcode.
36 #
37 # unbound_munin_hits - base volume, cache hits, unwanted traffic
38 # unbound_munin_queue - to monitor the internal requestlist
39 # unbound_munin_memory - memory usage
40 # unbound_munin_by_type - incoming queries by type
41 # unbound_munin_by_class - incoming queries by class
42 # unbound_munin_by_opcode - incoming queries by opcode
43 # unbound_munin_by_rcode - answers by rcode, validation status
44 # unbound_munin_by_flags - incoming queries by flags
45 # unbound_munin_histogram - histogram of query resolving times
46 #
47 # Magic markers - optional - used by installation scripts and
48 # munin-config: (originally contrib family but munin-node-configure ignores it)
49 #
50 #%# family=auto
51 #%# capabilities=autoconf suggest
52
53 # POD documentation
54 : <<=cut
55 =head1 NAME
56
57 unbound_munin_ - Munin plugin to monitor the Unbound DNS resolver.
58
59 =head1 APPLICABLE SYSTEMS
60
61 System with unbound daemon.
62
63 =head1 CONFIGURATION
64
65 [unbound*]
66 user root
67 env.unbound_conf /usr/local/etc/unbound/unbound.conf
68 env.unbound_control /usr/local/sbin/unbound-control
69 env.spoof_warn 1000
70 env.spoof_crit 100000
71
72 Use the .env settings to override the defaults.
73
74 =head1 USAGE
75
76 Can be used to present different graphs. Use ln -s for that name in
77 the plugins directory to enable the graph.
78 unbound_munin_hits - base volume, cache hits, unwanted traffic
79 unbound_munin_queue - to monitor the internal requestlist
80 unbound_munin_memory - memory usage
81 unbound_munin_by_type - incoming queries by type
82 unbound_munin_by_class - incoming queries by class
83 unbound_munin_by_opcode - incoming queries by opcode
84 unbound_munin_by_rcode - answers by rcode, validation status
85 unbound_munin_by_flags - incoming queries by flags
86 unbound_munin_histogram - histogram of query resolving times
87
88 =head1 AUTHOR
89
90 Copyright 2008 W.C.A. Wijngaards
91
92 =head1 LICENSE
93
94 BSD
95
96 =cut
97
# Tunables. Each one may be overridden through the environment, e.g. from
# munin's plugin-conf.d with env.unbound_conf, env.unbound_control,
# env.spoof_warn and env.spoof_crit.
conf="${unbound_conf:-/usr/local/etc/unbound/unbound.conf}"
ctrl="${unbound_control:-/usr/local/sbin/unbound-control}"
warn="${spoof_warn:-1000}"
crit="${spoof_crit:-100000}"

# Files kept in the munin plugin state directory: the last downloaded
# statistics, the list of statistic names seen so far, and the lock that
# serialises updates of the statistics file.
state="${MUNIN_PLUGSTATE}/unbound.state"
seentags="${MUNIN_PLUGSTATE}/unbound-seentags.state"
lock="${state}.lock"

# number of seconds between polling attempts.
# The statistics file is reused for at least this long, so that several
# links of this script can share one download.
lee=55

# sed arguments that shorten statistic names
# to keep things within 19 characters.
# The expressions are applied in the order listed.
ABBREV="-e s/total/t/ -e s/thread/t/ -e s/num/n/ -e s/query/q/ \
-e s/answer/a/ -e s/unwanted/u/ -e s/requestlist/ql/ -e s/type/t/ \
-e s/class/c/ -e s/opcode/o/ -e s/rcode/r/ -e s/edns/e/ -e s/mem/m/ \
-e s/cache/c/ -e s/mod/m/"
113
# Look up statistic $1 in the state file and leave it in the global $value.
#
# Arguments: $1 - statistic name exactly as it appears before the "=".
# Globals:   state (read), value (written)
#
# The name is compared literally: it is not treated as a regular
# expression, so the dots in a name only match dots. Only the first
# matching line is used. When the name is absent (or the state file cannot
# be read) value is set to "0".
get_value() {
  value=$(awk -v key="$1" '
    index($0, key "=") == 1 { sub(/^.*=/, ""); print; exit }
  ' "$state")
  if test -z "$value"; then
    value="0"
  fi
}
121
# Update list of seen query types etc to seentags file. This is run while
# holding the lock, after the state file is updated.
#
# Globals: state (read), seentags (read, then replaced)
#
# The new list is the union of the previous list, four names that are
# always listed, and every name in the state file that starts with "num".
# It is stored sorted, without duplicates and without empty lines.
#
# The list is assembled in a temporary file in the same directory and then
# renamed over the old one, so a reader never sees a half-written list.
update_seentags() {
  seentags_tmp="${seentags}.$$"
  if {
    # a missing list (first run) is not an error
    cat "${seentags}" 2>/dev/null
    printf '%s\n' \
      num.query.type.A \
      num.query.class.IN \
      num.query.opcode.QUERY \
      num.answer.rcode.NOERROR
    grep '^num' "${state}" | sed -e 's/=.*//'
  } | grep -v '^$' | sort -u >"$seentags_tmp"; then
    mv -f "$seentags_tmp" "${seentags}"
  else
    rm -f "$seentags_tmp"
  fi
}
133
# download the state from the unbound server.
#
# Globals: lock, state, lee, ctrl, conf (read); pid, i, now, value,
#          tmpstate (written)
# Calls:   get_value, update_seentags
# Exits:   1 when the lock cannot be obtained or the download fails;
#          the diagnostic goes to stderr so it is not mistaken for data.
#
# The lock file holds the process id of its owner. The statistics are
# downloaded into a temporary file and renamed into place only on success,
# so a failed download leaves the previous statistics untouched and a
# reader never sees a partial file.
get_state() {
  # obtain lock for fetching the state
  # because there is a race condition in fetching and writing to file

  # see if the lock is stale, if so, take it
  if test -f "$lock"; then
    pid=$(cat "$lock" 2>/dev/null)
    if ! kill -0 "$pid" >/dev/null 2>&1 && test "$pid" != "$$"; then
      echo "$$" >"$lock"
    fi
  fi

  i=0
  while test ! -f "$lock" || test "$(cat "$lock" 2>/dev/null)" != "$$"; do
    while test -f "$lock"; do
      # wait: poll without delay for the first 1000 rounds, then once
      # per second, and give up after 1500 rounds in total
      i=$((i + 1))
      if test "$i" -gt 1000; then
        sleep 1
      fi
      if test "$i" -gt 1500; then
        echo "error locking $lock = $(cat "$lock" 2>/dev/null)" >&2
        rm -f "$lock"
        exit 1
      fi
    done
    # try to get it; if the lock file cannot be written, carry on unlocked
    if echo "$$" >"$lock"; then :; else break; fi
  done

  # do not refetch if the file exists and only LEE seconds old
  if test -f "$state"; then
    now=$(date +%s)
    get_value "time.now"
    value=${value%%.*}
    # an unreadable timestamp counts as stale
    case $value in
      '' | *[!0-9]*) value=0 ;;
    esac
    if test "$now" -lt "$((value + lee))"; then
      rm -f "$lock"
      return
    fi
  fi

  tmpstate="${state}.$$"
  # $ctrl is left unquoted on purpose, so a setting that contains extra
  # words (for example a wrapper command) is still split into arguments.
  # shellcheck disable=SC2086
  if $ctrl -c "$conf" stats >"$tmpstate" && mv -f "$tmpstate" "$state"; then
    update_seentags
    rm -f "$lock"
  else
    echo "error retrieving data from unbound server" >&2
    rm -f "$tmpstate" "$lock"
    exit 1
  fi
}
184
# munin "autoconf" mode: print "yes" when the plugin looks usable, otherwise
# "no (reason)". The exit status is 0 in every case.
# Usable here means: the unbound configuration file exists and the
# directory that will hold the state file exists.
if test "$1" = "autoconf"; then
  if test ! -f "$conf"; then
    echo "no ($conf does not exist)"
    exit 0
  fi
  statedir=$(dirname "$state")
  if test ! -d "$statedir"; then
    echo "no ($statedir directory does not exist)"
    exit 0
  fi
  echo yes
  exit 0
fi
197
# munin "suggest" mode: print one graph name per line, then stop.
# Each name is a suffix this plugin can be linked as.
case "$1" in
  suggest)
    printf '%s\n' \
      hits \
      queue \
      memory \
      by_type \
      by_class \
      by_opcode \
      by_rcode \
      by_flags \
      histogram
    exit 0
    ;;
esac
210
# determine my type, by name:
# everything after the last "unbound_munin_" in the name this script was
# started as. A name without that marker is used unchanged.
id=${0##*unbound_munin_}
if test -z "$id"; then
  # some default to keep people sane.
  id="hits"
fi
217
# if $1 exists in statefile, config is echoed with label $2
#
# Arguments: $1 - statistic name as written in the state file
#            $2 - label to show for it
# Globals:   ABBREV, state (read), mn (written)
# Outputs:   the label, "min 0" and "type ABSOLUTE" lines for the field,
#            or nothing when the state file has no line for $1.
#
# The name is compared literally, so its dots only match dots.
exist_config() {
  # $ABBREV is a list of sed arguments and has to be split into words.
  # shellcheck disable=SC2086
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  if awk -v key="$1" '
    index($0, key "=") == 1 { found = 1; exit }
    END { exit !found }
  ' "$state" 2>/dev/null; then
    echo "$mn.label $2"
    echo "$mn.min 0"
    echo "$mn.type ABSOLUTE"
  fi
}
227
# print label and min 0 for a name $1 in unbound format
#
# Arguments: $1 - statistic name in unbound format
#            $2 - label to show for it
#            $3 - munin data type, passed through unchanged
# Globals:   ABBREV (read), mn (written)
# Outputs:   the label, "min 0" and type lines for the field.
p_config() {
  # $ABBREV is a list of sed arguments and has to be split into words.
  # shellcheck disable=SC2086
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  printf '%s\n' \
    "$mn.label $2" \
    "$mn.min 0" \
    "$mn.type $3"
}
235
# Print the config of one field per seen statistic whose name starts with
# "$1."; the rest of the name becomes the label, the type is ABSOLUTE.
# Arguments: $1 - name prefix without the trailing dot, e.g. num.query.type
# Globals:   seentags (read)
# Calls:     p_config
config_seen_tags() {
  while IFS= read -r nm; do
    case $nm in
      "$1".*) p_config "$nm" "${nm#"$1".}" "ABSOLUTE" ;;
    esac
  done <"$seentags"
}

# Print the config of one field of the reply time histogram.
# Arguments: $1 - field name, $2 - label, $3 - draw style, $4 - colour
config_histogram_field() {
  echo "$1.label $2"
  echo "$1.min 0"
  echo "$1.type ABSOLUTE"
  echo "$1.draw $3"
  echo "$1.colour $4"
}

# munin "config" mode: describe the graph selected by $id, then exit 0.
# The statistics are downloaded first only when no state file exists yet.
if test "$1" = "config"; then
  if test ! -f "$state"; then
    get_state
  fi
  case $id in
    hits)
      echo "graph_title Unbound DNS traffic and cache hits"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel queries / \${graph_period}"
      echo "graph_scale no"
      echo "graph_category dns"
      # one field per worker thread found in the state file
      for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
        sed -e 's/=.*//'); do
        exist_config "$x" "queries handled by ${x%.num.queries}"
      done
      p_config "total.num.queries" "total queries from clients" "ABSOLUTE"
      p_config "total.num.cachehits" "cache hits" "ABSOLUTE"
      p_config "total.num.prefetch" "cache prefetch" "ABSOLUTE"
      p_config "num.query.tcp" "TCP queries" "ABSOLUTE"
      p_config "num.query.tcpout" "TCP out queries" "ABSOLUTE"
      p_config "num.query.udpout" "UDP out queries" "ABSOLUTE"
      p_config "num.query.tls" "TLS queries" "ABSOLUTE"
      p_config "num.query.tls.resume" "TLS resumes" "ABSOLUTE"
      p_config "num.query.ipv6" "IPv6 queries" "ABSOLUTE"
      p_config "unwanted.queries" "queries that failed acl" "ABSOLUTE"
      p_config "unwanted.replies" "unwanted or unsolicited replies" "ABSOLUTE"
      # u_replies is the abbreviated field name of unwanted.replies
      echo "u_replies.warning $warn"
      echo "u_replies.critical $crit"
      echo "graph_info DNS queries to the recursive resolver. The unwanted replies could be innocent duplicate packets, late replies, or spoof threats."
      ;;
    queue)
      echo "graph_title Unbound requestlist size"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel number of queries"
      echo "graph_scale no"
      echo "graph_category dns"
      p_config "total.requestlist.avg" "Average size of queue on insert" "GAUGE"
      p_config "total.requestlist.max" "Max size of queue (in 5 min)" "GAUGE"
      p_config "total.requestlist.overwritten" "Number of queries replaced by new ones" "GAUGE"
      p_config "total.requestlist.exceeded" "Number of queries dropped due to lack of space" "GAUGE"
      echo "graph_info The queries that did not hit the cache and need recursion service take up space in the requestlist. If there are too many queries, first queries get overwritten, and at last resort dropped."
      ;;
    memory)
      echo "graph_title Unbound memory usage"
      echo "graph_args --base 1024 -l 0"
      echo "graph_vlabel memory used in bytes"
      echo "graph_category dns"
      p_config "mem.cache.rrset" "RRset cache memory" "GAUGE"
      p_config "mem.cache.message" "Message cache memory" "GAUGE"
      p_config "mem.mod.iterator" "Iterator module memory" "GAUGE"
      p_config "mem.mod.validator" "Validator module and key cache memory" "GAUGE"
      p_config "msg.cache.count" "msg cache count" "GAUGE"
      p_config "rrset.cache.count" "rrset cache count" "GAUGE"
      p_config "infra.cache.count" "infra cache count" "GAUGE"
      p_config "key.cache.count" "key cache count" "GAUGE"
      echo "graph_info The memory used by unbound."
      ;;
    by_type)
      echo "graph_title Unbound DNS queries by type"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel queries / \${graph_period}"
      echo "graph_scale no"
      echo "graph_category dns"
      config_seen_tags "num.query.type"
      echo "graph_info queries by DNS RR type queried for"
      ;;
    by_class)
      echo "graph_title Unbound DNS queries by class"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel queries / \${graph_period}"
      echo "graph_scale no"
      echo "graph_category dns"
      config_seen_tags "num.query.class"
      echo "graph_info queries by DNS RR class queried for."
      ;;
    by_opcode)
      echo "graph_title Unbound DNS queries by opcode"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel queries / \${graph_period}"
      echo "graph_scale no"
      echo "graph_category dns"
      config_seen_tags "num.query.opcode"
      echo "graph_info queries by opcode in the query packet."
      ;;
    by_rcode)
      echo "graph_title Unbound DNS answers by return code"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel answer packets / \${graph_period}"
      echo "graph_scale no"
      echo "graph_category dns"
      config_seen_tags "num.answer.rcode"
      p_config "num.answer.secure" "answer secure" "ABSOLUTE"
      p_config "num.answer.bogus" "answer bogus" "ABSOLUTE"
      p_config "num.rrset.bogus" "num rrsets marked bogus" "ABSOLUTE"
      echo "graph_info answers sorted by return value. rrsets bogus is the number of rrsets marked bogus per \${graph_period} by the validator"
      ;;
    by_flags)
      echo "graph_title Unbound DNS incoming queries by flags"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel queries / \${graph_period}"
      echo "graph_scale no"
      echo "graph_category dns"
      p_config "num.query.flags.QR" "QR (query reply) flag" "ABSOLUTE"
      p_config "num.query.flags.AA" "AA (auth answer) flag" "ABSOLUTE"
      p_config "num.query.flags.TC" "TC (truncated) flag" "ABSOLUTE"
      p_config "num.query.flags.RD" "RD (recursion desired) flag" "ABSOLUTE"
      p_config "num.query.flags.RA" "RA (rec avail) flag" "ABSOLUTE"
      p_config "num.query.flags.Z" "Z (zero) flag" "ABSOLUTE"
      p_config "num.query.flags.AD" "AD (auth data) flag" "ABSOLUTE"
      p_config "num.query.flags.CD" "CD (check disabled) flag" "ABSOLUTE"
      p_config "num.query.edns.present" "EDNS OPT present" "ABSOLUTE"
      p_config "num.query.edns.DO" "DO (DNSSEC OK) flag" "ABSOLUTE"
      echo "graph_info This graphs plots the flags inside incoming queries. For example, if QR, AA, TC, RA, Z flags are set, the query can be rejected. RD, AD, CD and DO are legitimately set by some software."
      ;;
    histogram)
      echo "graph_title Unbound DNS histogram of reply time"
      echo "graph_args --base 1000 -l 0"
      echo "graph_vlabel queries / \${graph_period}"
      echo "graph_scale no"
      echo "graph_category dns"
      # cache hits form the base area; the time buckets are stacked on top
      config_histogram_field hcache "cache hits" AREA 999999
      config_histogram_field h64ms "0 msec - 66 msec" STACK 0000FF
      config_histogram_field h128ms "66 msec - 131 msec" STACK 1F00DF
      config_histogram_field h256ms "131 msec - 262 msec" STACK 3F00BF
      config_histogram_field h512ms "262 msec - 524 msec" STACK 5F009F
      config_histogram_field h1s "524 msec - 1 sec" STACK 7F007F
      config_histogram_field h2s "1 sec - 2 sec" STACK 9F005F
      config_histogram_field h4s "2 sec - 4 sec" STACK BF003F
      config_histogram_field h8s "4 sec - 8 sec" STACK DF001F
      config_histogram_field h16s "8 sec - ..." STACK FF0000
      echo "graph_info Histogram of the reply times for queries."
      ;;
    *)
      # nothing is printed on stdout for a name that is not a known graph
      echo "unbound_munin_: unknown graph '$id'" >&2
      ;;
  esac

  exit 0
fi
424
# do the stats itself
get_state

# get the time elapsed
# get_value yields "0" when the statistic is missing, so a zero here also
# covers the case where no data could be read. The message goes to stderr
# so that it is not taken for a field value.
get_value "time.elapsed"
case $value in
  0 | 0.000000)
    echo "error: time elapsed 0 or could not retrieve data" >&2
    exit 1
    ;;
esac
elapsed="$value"
435
# print value for $1
#
# Arguments: $1 - statistic name in unbound format
# Globals:   ABBREV (read), mn and value (written)
# Calls:     get_value
# Outputs:   "<abbreviated name>.value <value>"
print_value() {
  # $ABBREV is a list of sed arguments and has to be split into words.
  # shellcheck disable=SC2086
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  get_value "$1"
  printf '%s.value %s\n' "$mn" "$value"
}
442
# print value if line already found in $2
#
# Arguments: $1 - statistic name in unbound format
#            $2 - a "name=value" line; everything after its last "=" is
#                 taken as the value
# Globals:   ABBREV (read), mn and value (written)
# Outputs:   "<abbreviated name>.value <value>"
print_value_line() {
  # $ABBREV is a list of sed arguments and has to be split into words.
  # shellcheck disable=SC2086
  mn=$(echo "$1" | sed $ABBREV | tr . _)
  value=${2##*=}
  printf '%s.value %s\n' "$mn" "$value"
}
449
450
# Print the value of every seen statistic whose name starts with "$1.".
# Arguments: $1 - name prefix without the trailing dot, e.g. num.query.type
# Globals:   seentags (read)
# Calls:     print_value
values_seen_tags() {
  while IFS= read -r nm; do
    case $nm in
      "$1".*) print_value "$nm" ;;
    esac
  done <"$seentags"
}

# Add up a run of adjacent histogram buckets and leave the total in $value.
# Arguments: $1 - "usec" for buckets below one second (bounds are the
#                 microsecond part), anything else for buckets whose
#                 bounds are whole seconds
#            $2 - lower bound of the first bucket
#            $3... - upper bound of each bucket in turn; every upper bound
#                 is the lower bound of the next bucket
# Calls:     get_value
# A bucket whose count is not a plain number is counted as 0.
sum_histogram() {
  case $1 in
    usec) hist_fmt='histogram.000000.%06d.to.000000.%06d' ;;
    *) hist_fmt='histogram.%06d.000000.to.%06d.000000' ;;
  esac
  hist_lo=$2
  shift 2
  hist_sum=0
  for hist_hi in "$@"; do
    # shellcheck disable=SC2059
    get_value "$(printf "$hist_fmt" "$hist_lo" "$hist_hi")"
    case $value in
      '' | *[!0-9]*) value=0 ;;
    esac
    hist_sum=$((hist_sum + value))
    hist_lo=$hist_hi
  done
  value=$hist_sum
}

# Print the field values of the graph selected by $id.
case $id in
  hits)
    # per-thread counters found in the state file, then the fixed fields;
    # a field is only reported when the state file has a line for it
    for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
      sed -e 's/=.*//') total.num.queries \
      total.num.cachehits total.num.prefetch num.query.tcp \
      num.query.tcpout num.query.udpout num.query.tls num.query.tls.resume \
      num.query.ipv6 unwanted.queries \
      unwanted.replies; do
      if grep "^$x=" "$state" >/dev/null 2>&1; then
        print_value "$x"
      fi
    done
    ;;
  queue)
    for x in total.requestlist.avg total.requestlist.max \
      total.requestlist.overwritten total.requestlist.exceeded; do
      print_value "$x"
    done
    ;;
  memory)
    for x in mem.cache.rrset mem.cache.message mem.mod.iterator \
      mem.mod.validator msg.cache.count rrset.cache.count \
      infra.cache.count key.cache.count; do
      print_value "$x"
    done
    ;;
  by_type)
    values_seen_tags "num.query.type"
    ;;
  by_class)
    values_seen_tags "num.query.class"
    ;;
  by_opcode)
    values_seen_tags "num.query.opcode"
    ;;
  by_rcode)
    values_seen_tags "num.answer.rcode"
    print_value "num.answer.secure"
    print_value "num.answer.bogus"
    print_value "num.rrset.bogus"
    ;;
  by_flags)
    for x in num.query.flags.QR num.query.flags.AA num.query.flags.TC \
      num.query.flags.RD num.query.flags.RA num.query.flags.Z \
      num.query.flags.AD num.query.flags.CD num.query.edns.present \
      num.query.edns.DO; do
      print_value "$x"
    done
    ;;
  histogram)
    get_value total.num.cachehits
    echo "hcache.value $value"
    # everything from 0 up to 0.065536 sec is reported as one field
    sum_histogram usec 0 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 \
      8192 16384 32768 65536
    echo "h64ms.value $value"
    get_value histogram.000000.065536.to.000000.131072
    echo "h128ms.value $value"
    get_value histogram.000000.131072.to.000000.262144
    echo "h256ms.value $value"
    get_value histogram.000000.262144.to.000000.524288
    echo "h512ms.value $value"
    get_value histogram.000000.524288.to.000001.000000
    echo "h1s.value $value"
    get_value histogram.000001.000000.to.000002.000000
    echo "h2s.value $value"
    get_value histogram.000002.000000.to.000004.000000
    echo "h4s.value $value"
    get_value histogram.000004.000000.to.000008.000000
    echo "h8s.value $value"
    # everything from 8 sec up to 524288 sec is reported as one field
    sum_histogram sec 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 \
      32768 65536 131072 262144 524288
    echo "h16s.value $value"
    ;;
  *)
    # nothing is printed on stdout for a name that is not a known graph
    echo "unbound_munin_: unknown graph '$id'" >&2
    ;;
esac
567