#!/bin/sh
2 #
3 # plugin for munin to monitor usage of unbound servers.
4 # To install copy this to /usr/local/share/munin/plugins/unbound_munin_
5 # and use munin-node-configure (--suggest, --shell).
6 #
7 # (C) 2008 W.C.A. Wijngaards. BSD Licensed.
8 #
9 # To install; enable statistics and unbound-control in unbound.conf
10 # server: extended-statistics: yes
11 # statistics-cumulative: no
12 # statistics-interval: 0
13 # remote-control: control-enable: yes
14 # Run the command unbound-control-setup to generate the key files.
15 #
16 # Environment variables for this script
17 # statefile - where to put temporary statefile.
18 # unbound_conf - where the unbound.conf file is located.
19 # unbound_control - where to find unbound-control executable.
20 # spoof_warn - what level to warn about spoofing
21 # spoof_crit - what level to crit about spoofing
22 #
23 # You can set them in your munin/plugin-conf.d/plugins.conf file
24 # with:
25 # [unbound*]
26 # user root
27 # env.statefile /usr/local/var/munin/plugin-state/unbound-state
28 # env.unbound_conf /usr/local/etc/unbound/unbound.conf
29 # env.unbound_control /usr/local/sbin/unbound-control
30 # env.spoof_warn 1000
31 # env.spoof_crit 100000
32 #
33 # This plugin can create different graphs depending on what name
34 # you link it as (with ln -s) into the plugins directory
35 # You can link it multiple times.
36 # If you are only a casual user, the _hits and _by_type are most interesting,
37 # possibly followed by _by_rcode.
38 #
39 # unbound_munin_hits - base volume, cache hits, unwanted traffic
40 # unbound_munin_queue - to monitor the internal requestlist
41 # unbound_munin_memory - memory usage
42 # unbound_munin_by_type - incoming queries by type
43 # unbound_munin_by_class - incoming queries by class
44 # unbound_munin_by_opcode - incoming queries by opcode
45 # unbound_munin_by_rcode - answers by rcode, validation status
46 # unbound_munin_by_flags - incoming queries by flags
47 # unbound_munin_histogram - histogram of query resolving times
48 #
49 # Magic markers - optional - used by installation scripts and
50 # munin-config: (originally contrib family but munin-node-configure ignores it)
51 #
52 #%# family=auto
53 #%# capabilities=autoconf suggest
54
55 # POD documentation
56 : <<=cut
57 =head1 NAME
58
59 unbound_munin_ - Munin plugin to monitor the Unbound DNS resolver.
60
61 =head1 APPLICABLE SYSTEMS
62
63 System with unbound daemon.
64
65 =head1 CONFIGURATION
66
67 [unbound*]
68 user root
69 env.statefile /usr/local/var/munin/plugin-state/unbound-state
70 env.unbound_conf /usr/local/etc/unbound/unbound.conf
71 env.unbound_control /usr/local/sbin/unbound-control
72 env.spoof_warn 1000
73 env.spoof_crit 100000
74
75 Use the .env settings to override the defaults.
76
77 =head1 USAGE
78
79 Can be used to present different graphs. Use ln -s for that name in
80 the plugins directory to enable the graph.
81 unbound_munin_hits - base volume, cache hits, unwanted traffic
82 unbound_munin_queue - to monitor the internal requestlist
83 unbound_munin_memory - memory usage
84 unbound_munin_by_type - incoming queries by type
85 unbound_munin_by_class - incoming queries by class
86 unbound_munin_by_opcode - incoming queries by opcode
87 unbound_munin_by_rcode - answers by rcode, validation status
88 unbound_munin_by_flags - incoming queries by flags
89 unbound_munin_histogram - histogram of query resolving times
90
91 =head1 AUTHOR
92
93 Copyright 2008 W.C.A. Wijngaards
94
95 =head1 LICENSE
96
97 BSD
98
99 =cut
100
# Settings.  Each one falls back to a built-in default when the matching
# environment variable (see the header of this file) is unset or empty.
conf="${unbound_conf:-/usr/local/etc/unbound/unbound.conf}"
ctrl="${unbound_control:-/usr/local/sbin/unbound-control}"
state="${statefile:-/usr/local/var/munin/plugin-state/unbound-state}"
lock="${state}.lock"
warn="${spoof_warn:-1000}"
crit="${spoof_crit:-100000}"

# Number of seconds between polling attempts.  A state file younger than
# this is reused instead of refetched, so that multiple links of this
# script can share one set of results.
lee=55

# sed arguments that abbreviate statistic names, to keep the resulting
# field names within 19 characters.  The variable is expanded unquoted by
# its users so that every "-e expr" pair becomes separate arguments.
ABBREV=""
for rule in total/t thread/t num/n query/q answer/a unwanted/u \
  requestlist/ql type/t class/c opcode/o rcode/r edns/e mem/m \
  cache/c mod/m; do
  ABBREV="${ABBREV:+$ABBREV }-e s/$rule/"
done
unset rule
115
# Look up statistic $1 in the state file and leave its value in the global
# variable $value.  $value is set to "0" when the statistic is not present.
#
# The name is compared as a literal string against the text before "=", so
# the dots in statistic names only match dots, and only the first matching
# line is used.
#
# Globals:   state (read), value (written)
# Arguments: $1 - statistic name, e.g. "time.now"
get_value() {
  value=$(awk -v key="$1" '
    index($0, key "=") == 1 { print substr($0, length(key) + 2); exit }
  ' "$state")
  if test -z "$value"; then
    value="0"
  fi
}
123
# Download the statistics from the unbound server into the state file.
#
# A lock file holding the PID of its owner serialises the fetch, because
# several links of this script may run at once and share one state file.
# If the existing state file is younger than $lee seconds it is reused and
# nothing is fetched.
#
# The new statistics are written to a temporary file next to the state
# file and renamed into place only when the fetch succeeded, so a failed
# fetch never replaces the previous state with a truncated file.
#
# Globals:   state, lock, lee, ctrl, conf (read); pid, i, now, value,
#            newstate (written).  Calls get_value.
# Outputs:   diagnostics on stderr.
# Exits:     the whole script with status 1 when the lock cannot be
#            obtained or the statistics cannot be retrieved.
get_state() {
  # See if the lock is stale (its owner no longer exists); if so, take it.
  if test -f "$lock"; then
    pid=$(cat "$lock" 2>/dev/null)
    if ! kill -0 "$pid" >/dev/null 2>&1 && test "$pid" != "$$"; then
      echo $$ >"$lock"
    fi
  fi

  # Loop until the lock file exists and contains our own PID.
  i=0
  while test ! -f "$lock" || test "$(cat "$lock" 2>/dev/null)" != "$$"; do
    while test -f "$lock"; do
      # Wait for the holder.  The first 1000 polls are not delayed; after
      # that each poll sleeps one second, and after 1500 polls we give up.
      i=$(expr "$i" + 1)
      if test "$i" -gt 1000; then
        sleep 1
      fi
      if test "$i" -gt 1500; then
        echo "error locking $lock = $(cat "$lock" 2>/dev/null)" >&2
        rm -f "$lock"
        exit 1
      fi
    done
    # Try to get it; the outer condition verifies that we won.
    echo $$ >"$lock"
  done

  # Do not refetch if the file exists and is less than $lee seconds old.
  if test -f "$state"; then
    now=$(date +%s)
    get_value "time.now"
    value=${value%%.*}
    case "$value" in
      '' | *[!0-9]*)
        # Timestamp is not a plain number: treat the file as outdated.
        ;;
      *)
        if test "$now" -lt "$(expr "$value" + "$lee")"; then
          rm -f "$lock"
          return
        fi
        ;;
    esac
  fi

  newstate="$state.$$"
  # $ctrl is deliberately left unquoted so that a value consisting of a
  # command plus arguments keeps being split the way it was before.
  # shellcheck disable=SC2086
  if ! $ctrl -c "$conf" stats >"$newstate"; then
    echo "error retrieving data from unbound server" >&2
    rm -f "$newstate" "$lock"
    exit 1
  fi
  if ! mv -f "$newstate" "$state"; then
    echo "error writing $state" >&2
    rm -f "$newstate" "$lock"
    exit 1
  fi
  rm -f "$lock"
}
173
# "autoconf" mode: tell munin-node-configure whether this plugin can be
# used on this host.  Prints "yes", or "no (reason)" when the unbound
# configuration file or the directory for the state file is missing.
#
# The answer is carried by the printed text; the exit status is 0 in both
# cases so that a "no" is not reported as a plugin failure.
if test "$1" = "autoconf"; then
  if test ! -f "$conf"; then
    echo "no ($conf does not exist)"
  elif test ! -d "$(dirname "$state")"; then
    echo "no ($(dirname "$state") directory does not exist)"
  else
    echo yes
  fi
  exit 0
fi
186
# "suggest" mode: list, one per line, the graph names this script can be
# linked as (the part after "unbound_munin_").
if test "$1" = "suggest"; then
  for graph in hits queue memory by_type by_class by_opcode by_rcode \
    by_flags histogram; do
    echo "$graph"
  done
  exit 0
fi
199
# Determine which graph to produce from the name this script was run as:
# everything after the last "unbound_munin_" in $0.  When nothing follows
# it, fall back to "hits" as a default to keep people sane.
id=${0##*unbound_munin_}
id=${id:-hits}
206
# If statistic $1 exists in the state file, print its field configuration
# (label $2, minimum 0, type ABSOLUTE); otherwise print nothing.
#
# The name is compared as a literal string against the start of each line,
# so the dots in statistic names only match dots.
#
# Globals:   state, ABBREV (read); mn (written)
# Arguments: $1 - statistic name in unbound format
#            $2 - label for the field
exist_config() {
  # $ABBREV is unquoted on purpose: it holds several sed arguments.
  # shellcheck disable=SC2086
  mn=$(printf '%s\n' "$1" | sed $ABBREV | tr . _)
  if awk -v key="$1" '
    index($0, key "=") == 1 { found = 1; exit }
    END { exit !found }
  ' "$state" 2>/dev/null; then
    echo "$mn.label $2"
    echo "$mn.min 0"
    echo "$mn.type ABSOLUTE"
  fi
}
216
# Print the field configuration for statistic $1 (a name in unbound
# format): its label $2, a minimum of 0, and its munin data type $3.
p_config() {
  field=$(echo $1 | sed $ABBREV | tr . _)
  # The type is appended only when given, so no trailing blank is printed.
  printf '%s\n' "$field.label $2" "$field.min 0" "$field.type${3:+ $3}"
}
224
# "config" mode: print the munin graph and field definitions for the graph
# selected by $id, then exit.
#
# Several graphs take their field list from the statistics the server
# reports, so a state file is fetched first if none exists yet.  An
# existing state file is used as it is, without refreshing it.
#
# A graph name that is not known is reported on stderr with exit status 1
# instead of silently producing an empty configuration.
if test "$1" = "config"; then
  if test ! -f "$state"; then
    get_state
  fi

  # Print the attributes shared by all rate graphs.
  # $1 - graph title, $2 - label of the vertical axis
  _cfg_header() {
    printf '%s\n' \
      "graph_title $1" \
      "graph_args --base 1000 -l 0" \
      "graph_vlabel $2" \
      "graph_scale no" \
      "graph_category DNS"
  }

  # Print an ABSOLUTE field for every statistic in the state file whose
  # name starts with $1; the label is the name with that prefix (and the
  # character following it) removed.
  _cfg_by_prefix() {
    grep "^$1" "$state" | while IFS= read -r entry; do
      key=${entry%%=*}
      p_config "$key" "$(printf '%s\n' "$key" | sed -e "s/$1.//")" "ABSOLUTE"
    done
  }

  # Print one field of the reply time histogram.
  # $1 - field name, $2 - label, $3 - draw style, $4 - colour
  _cfg_hist() {
    printf '%s\n' \
      "$1.label $2" \
      "$1.min 0" \
      "$1.type ABSOLUTE" \
      "$1.draw $3" \
      "$1.colour $4"
  }

  case "$id" in
    hits)
      _cfg_header "Unbound DNS traffic and cache hits" \
        "queries / \${graph_period}"
      # One field per worker thread that the server reports.
      grep '^thread[0-9][0-9]*\.num\.queries=' "$state" |
        sed -e 's/=.*//' |
        while IFS= read -r key; do
          exist_config "$key" "queries handled by ${key%.num.queries}"
        done
      p_config "total.num.queries" "total queries from clients" "ABSOLUTE"
      p_config "total.num.cachehits" "cache hits" "ABSOLUTE"
      p_config "total.num.prefetch" "cache prefetch" "ABSOLUTE"
      p_config "num.query.tcp" "TCP queries" "ABSOLUTE"
      p_config "num.query.tcpout" "TCP out queries" "ABSOLUTE"
      p_config "num.query.ipv6" "IPv6 queries" "ABSOLUTE"
      p_config "unwanted.queries" "queries that failed acl" "ABSOLUTE"
      p_config "unwanted.replies" "unwanted or unsolicited replies" "ABSOLUTE"
      # u_replies is the abbreviated field name of unwanted.replies.
      echo "u_replies.warning $warn"
      echo "u_replies.critical $crit"
      echo "graph_info DNS queries to the recursive resolver. The unwanted replies could be innocent duplicate packets, late replies, or spoof threats."
      ;;
    queue)
      _cfg_header "Unbound requestlist size" "number of queries"
      p_config "total.requestlist.avg" "Average size of queue on insert" "GAUGE"
      p_config "total.requestlist.max" "Max size of queue (in 5 min)" "GAUGE"
      p_config "total.requestlist.overwritten" "Number of queries replaced by new ones" "GAUGE"
      p_config "total.requestlist.exceeded" "Number of queries dropped due to lack of space" "GAUGE"
      echo "graph_info The queries that did not hit the cache and need recursion service take up space in the requestlist. If there are too many queries, first queries get overwritten, and at last resort dropped."
      ;;
    memory)
      # This graph uses base 1024 and sets no graph_scale, so it does not
      # use the shared header.
      echo "graph_title Unbound memory usage"
      echo "graph_args --base 1024 -l 0"
      echo "graph_vlabel memory used in bytes"
      echo "graph_category DNS"
      p_config "mem.total.sbrk" "Total memory" "GAUGE"
      p_config "mem.cache.rrset" "RRset cache memory" "GAUGE"
      p_config "mem.cache.message" "Message cache memory" "GAUGE"
      p_config "mem.mod.iterator" "Iterator module memory" "GAUGE"
      p_config "mem.mod.validator" "Validator module and key cache memory" "GAUGE"
      p_config "msg.cache.count" "msg cache count" "GAUGE"
      p_config "rrset.cache.count" "rrset cache count" "GAUGE"
      p_config "infra.cache.count" "infra cache count" "GAUGE"
      p_config "key.cache.count" "key cache count" "GAUGE"
      echo "graph_info The memory used by unbound."
      ;;
    by_type)
      _cfg_header "Unbound DNS queries by type" "queries / \${graph_period}"
      _cfg_by_prefix "num.query.type"
      echo "graph_info queries by DNS RR type queried for"
      ;;
    by_class)
      _cfg_header "Unbound DNS queries by class" "queries / \${graph_period}"
      _cfg_by_prefix "num.query.class"
      echo "graph_info queries by DNS RR class queried for."
      ;;
    by_opcode)
      _cfg_header "Unbound DNS queries by opcode" "queries / \${graph_period}"
      _cfg_by_prefix "num.query.opcode"
      echo "graph_info queries by opcode in the query packet."
      ;;
    by_rcode)
      _cfg_header "Unbound DNS answers by return code" \
        "answer packets / \${graph_period}"
      _cfg_by_prefix "num.answer.rcode"
      p_config "num.answer.secure" "answer secure" "ABSOLUTE"
      p_config "num.answer.bogus" "answer bogus" "ABSOLUTE"
      p_config "num.rrset.bogus" "num rrsets marked bogus" "ABSOLUTE"
      echo "graph_info answers sorted by return value. rrsets bogus is the number of rrsets marked bogus per \${graph_period} by the validator"
      ;;
    by_flags)
      _cfg_header "Unbound DNS incoming queries by flags" \
        "queries / \${graph_period}"
      p_config "num.query.flags.QR" "QR (query reply) flag" "ABSOLUTE"
      p_config "num.query.flags.AA" "AA (auth answer) flag" "ABSOLUTE"
      p_config "num.query.flags.TC" "TC (truncated) flag" "ABSOLUTE"
      p_config "num.query.flags.RD" "RD (recursion desired) flag" "ABSOLUTE"
      p_config "num.query.flags.RA" "RA (rec avail) flag" "ABSOLUTE"
      p_config "num.query.flags.Z" "Z (zero) flag" "ABSOLUTE"
      p_config "num.query.flags.AD" "AD (auth data) flag" "ABSOLUTE"
      p_config "num.query.flags.CD" "CD (check disabled) flag" "ABSOLUTE"
      p_config "num.query.edns.present" "EDNS OPT present" "ABSOLUTE"
      p_config "num.query.edns.DO" "DO (DNSSEC OK) flag" "ABSOLUTE"
      echo "graph_info This graphs plots the flags inside incoming queries. For example, if QR, AA, TC, RA, Z flags are set, the query can be rejected. RD, AD, CD and DO are legitimately set by some software."
      ;;
    histogram)
      _cfg_header "Unbound DNS histogram of reply time" \
        "queries / \${graph_period}"
      # Cache hits form the base area; the time buckets are stacked on top.
      _cfg_hist hcache "cache hits" AREA 999999
      _cfg_hist h64ms "0 msec - 66 msec" STACK 0000FF
      _cfg_hist h128ms "66 msec - 131 msec" STACK 1F00DF
      _cfg_hist h256ms "131 msec - 262 msec" STACK 3F00BF
      _cfg_hist h512ms "262 msec - 524 msec" STACK 5F009F
      _cfg_hist h1s "524 msec - 1 sec" STACK 7F007F
      _cfg_hist h2s "1 sec - 2 sec" STACK 9F005F
      _cfg_hist h4s "2 sec - 4 sec" STACK BF003F
      _cfg_hist h8s "4 sec - 8 sec" STACK DF001F
      _cfg_hist h16s "8 sec - ..." STACK FF0000
      echo "graph_info Histogram of the reply times for queries."
      ;;
    *)
      echo "unknown graph type '$id'" >&2
      exit 1
      ;;
  esac

  exit 0
fi
415
# No mode argument was handled above, so this is a normal run: refresh the
# state file (or reuse a recent one) before values are printed.
get_state

# Get the time elapsed.  A value of zero, which is also what get_value
# returns when the statistic is missing, means there is nothing to report.
get_value "time.elapsed"
case "$value" in
  0 | 0.000000)
    echo "error: time elapsed 0 or could not retrieve data"
    exit 1
    ;;
esac
elapsed="$value"
426
# Print the munin value line for statistic $1: the abbreviated field name
# followed by the value that get_value finds in the state file.
print_value() {
  get_value "$1"
  # $value stays unquoted so that it is passed to echo as separate words.
  echo "$(echo $1 | sed $ABBREV | tr . _).value" $value
}
433
# Print the munin value line for statistic $1, taking the value from $2,
# a "name=value" line that was already read from the state file.  The
# value is whatever follows the last "=" of that line.
print_value_line() {
  value=${2##*=}
  # $value stays unquoted so that it is passed to echo as separate words.
  echo "$(echo $1 | sed $ABBREV | tr . _).value" $value
}
440
441
# Print a value line for every statistic in the state file whose name
# starts with $1.
_print_by_prefix() {
  grep "^$1" "$state" | while IFS= read -r entry; do
    print_value_line "${entry%%=*}" "$entry"
  done
}

# Add up the values of all statistics named in the arguments and leave the
# total in the global variable $r.  Missing statistics count as 0.
_sum_values() {
  r=0
  for key in "$@"; do
    get_value "$key"
    r=$(expr "$r" + "$value")
  done
}

# Print the values for the graph selected by $id.  A graph name that is
# not known is reported on stderr with exit status 1.
case "$id" in
  hits)
    # Per-thread counters reported by the server, followed by the fixed
    # totals; only statistics present in the state file are printed.
    for x in $(grep '^thread[0-9][0-9]*\.num\.queries=' "$state" |
      sed -e 's/=.*//') total.num.queries \
      total.num.cachehits total.num.prefetch num.query.tcp \
      num.query.tcpout num.query.ipv6 unwanted.queries \
      unwanted.replies; do
      if grep "^$x=" "$state" >/dev/null 2>&1; then
        print_value "$x"
      fi
    done
    ;;
  queue)
    for x in total.requestlist.avg total.requestlist.max \
      total.requestlist.overwritten total.requestlist.exceeded; do
      print_value "$x"
    done
    ;;
  memory)
    # $ABBREV is unquoted on purpose: it holds several sed arguments.
    # shellcheck disable=SC2086
    mn=$(echo mem.total.sbrk | sed $ABBREV | tr . _)
    get_value 'mem.total.sbrk'
    if test "$value" -eq 0; then
      # The server reported no sbrk total.  Fall back to the resident set
      # size that ps reports for the daemon, found through the pidfile
      # named in the configuration, multiplied by 1024 (presumably ps
      # gives the size in kilobytes; confirm for the platform in use).
      # Error output of the helpers is discarded so it cannot end up in
      # the values; anything that is not a plain number yields 0.
      chk=$(echo "$ctrl" | sed -e 's/-control$/-checkconf/')
      # shellcheck disable=SC2086
      pidf=$($chk -o pidfile "$conf" 2>/dev/null)
      pid=$(cat "$pidf" 2>/dev/null)
      rss=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d ' ')
      case "$rss" in
        '' | *[!0-9]*) value=0 ;;
        *) value=$(expr "$rss" \* 1024) ;;
      esac
    fi
    echo "$mn.value $value"
    for x in mem.cache.rrset mem.cache.message mem.mod.iterator \
      mem.mod.validator msg.cache.count rrset.cache.count \
      infra.cache.count key.cache.count; do
      print_value "$x"
    done
    ;;
  by_type)
    _print_by_prefix "num.query.type"
    ;;
  by_class)
    _print_by_prefix "num.query.class"
    ;;
  by_opcode)
    _print_by_prefix "num.query.opcode"
    ;;
  by_rcode)
    _print_by_prefix "num.answer.rcode"
    print_value "num.answer.secure"
    print_value "num.answer.bogus"
    print_value "num.rrset.bogus"
    ;;
  by_flags)
    for x in num.query.flags.QR num.query.flags.AA num.query.flags.TC \
      num.query.flags.RD num.query.flags.RA num.query.flags.Z \
      num.query.flags.AD num.query.flags.CD num.query.edns.present \
      num.query.edns.DO; do
      print_value "$x"
    done
    ;;
  histogram)
    get_value total.num.cachehits
    echo "hcache.value $value"
    # All buckets below 0.065536 seconds are combined into one field.
    _sum_values \
      histogram.000000.000000.to.000000.000001 \
      histogram.000000.000001.to.000000.000002 \
      histogram.000000.000002.to.000000.000004 \
      histogram.000000.000004.to.000000.000008 \
      histogram.000000.000008.to.000000.000016 \
      histogram.000000.000016.to.000000.000032 \
      histogram.000000.000032.to.000000.000064 \
      histogram.000000.000064.to.000000.000128 \
      histogram.000000.000128.to.000000.000256 \
      histogram.000000.000256.to.000000.000512 \
      histogram.000000.000512.to.000000.001024 \
      histogram.000000.001024.to.000000.002048 \
      histogram.000000.002048.to.000000.004096 \
      histogram.000000.004096.to.000000.008192 \
      histogram.000000.008192.to.000000.016384 \
      histogram.000000.016384.to.000000.032768 \
      histogram.000000.032768.to.000000.065536
    echo "h64ms.value $r"
    # The middle buckets map one to one onto a field ("field=statistic").
    for pair in \
      h128ms=histogram.000000.065536.to.000000.131072 \
      h256ms=histogram.000000.131072.to.000000.262144 \
      h512ms=histogram.000000.262144.to.000000.524288 \
      h1s=histogram.000000.524288.to.000001.000000 \
      h2s=histogram.000001.000000.to.000002.000000 \
      h4s=histogram.000002.000000.to.000004.000000 \
      h8s=histogram.000004.000000.to.000008.000000; do
      get_value "${pair#*=}"
      echo "${pair%%=*}.value $value"
    done
    # All buckets from 8 seconds upwards are combined into one field.
    _sum_values \
      histogram.000008.000000.to.000016.000000 \
      histogram.000016.000000.to.000032.000000 \
      histogram.000032.000000.to.000064.000000 \
      histogram.000064.000000.to.000128.000000 \
      histogram.000128.000000.to.000256.000000 \
      histogram.000256.000000.to.000512.000000 \
      histogram.000512.000000.to.001024.000000 \
      histogram.001024.000000.to.002048.000000 \
      histogram.002048.000000.to.004096.000000 \
      histogram.004096.000000.to.008192.000000 \
      histogram.008192.000000.to.016384.000000 \
      histogram.016384.000000.to.032768.000000 \
      histogram.032768.000000.to.065536.000000 \
      histogram.065536.000000.to.131072.000000 \
      histogram.131072.000000.to.262144.000000 \
      histogram.262144.000000.to.524288.000000
    echo "h16s.value $r"
    ;;
  *)
    echo "unknown graph type '$id'" >&2
    exit 1
    ;;
esac
575