Home | History | Annotate | Line # | Download | only in sets
join.awk revision 1.5
      1  1.5    apb #	$NetBSD: join.awk,v 1.5 2014/10/23 14:19:33 apb Exp $
      2  1.1  lukem #
      3  1.1  lukem # Copyright (c) 2002 The NetBSD Foundation, Inc.
      4  1.1  lukem # All rights reserved.
      5  1.1  lukem #
      6  1.1  lukem # This code is derived from software contributed to The NetBSD Foundation
      7  1.1  lukem # by Luke Mewburn of Wasabi Systems.
      8  1.1  lukem #
      9  1.1  lukem # Redistribution and use in source and binary forms, with or without
     10  1.1  lukem # modification, are permitted provided that the following conditions
     11  1.1  lukem # are met:
     12  1.1  lukem # 1. Redistributions of source code must retain the above copyright
     13  1.1  lukem #    notice, this list of conditions and the following disclaimer.
     14  1.1  lukem # 2. Redistributions in binary form must reproduce the above copyright
     15  1.1  lukem #    notice, this list of conditions and the following disclaimer in the
     16  1.1  lukem #    documentation and/or other materials provided with the distribution.
     17  1.1  lukem #
     18  1.1  lukem # THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  1.1  lukem # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  1.1  lukem # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  1.1  lukem # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  1.1  lukem # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  1.1  lukem # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  1.1  lukem # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  1.1  lukem # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  1.1  lukem # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  1.1  lukem # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  1.1  lukem # POSSIBILITY OF SUCH DAMAGE.
     29  1.1  lukem #
     30  1.1  lukem # join.awk F1 F2
     31  1.1  lukem #	Similar to join(1), this reads a list of words from F1
     32  1.1  lukem #	and outputs lines in F2 with a first word that is in F1.
     33  1.5    apb #	The first word is canonicalised via vis(unvis(word))).
     34  1.5    apb #	Neither file needs to be sorted.
     35  1.1  lukem 
     36  1.4    apb function unvis(s) \
     37  1.4    apb {
     38  1.4    apb 	# XXX: We don't handle the complete range of vis encodings
     39  1.4    apb 	unvis_result = ""
     40  1.4    apb 	while (length(s) > 0) {
     41  1.4    apb 		unvis_pos = match(s, "\\\\.")
     42  1.4    apb 		if (unvis_pos == 0) {
     43  1.4    apb 			unvis_result = unvis_result "" s
     44  1.4    apb 			s = ""
     45  1.4    apb 			break
     46  1.4    apb 		}
     47  1.4    apb 		# copy the part before the next backslash
     48  1.4    apb 		unvis_result = unvis_result "" substr(s, 1, unvis_pos - 1)
     49  1.4    apb 		s = substr(s, unvis_pos)
     50  1.4    apb 		# process the backslash and next few chars
     51  1.4    apb 		if (substr(s, 1, 2) == "\\\\") {
     52  1.4    apb 			# double backslash -> single backslash
     53  1.4    apb 			unvis_result = unvis_result "\\"
     54  1.4    apb 			s = substr(s, 3)
     55  1.4    apb 		} else if (match(s, "\\\\[0-7][0-7][0-7]") == 1) {
     56  1.4    apb 			# \ooo with three octal digits.
     57  1.5    apb 			# XXX: use strtonum() when that is available
     58  1.4    apb 			unvis_result = unvis_result "" sprintf("%c", \
     59  1.4    apb 				0+substr(s, 2, 1) * 64 + \
     60  1.4    apb 				0+substr(s, 3, 1) * 8 + \
     61  1.4    apb 				0+substr(s, 4, 1))
     62  1.4    apb 			s = substr(s, 5)
     63  1.4    apb 		} else {
     64  1.4    apb 			# unrecognised escape: keep the literal backslash
     65  1.4    apb 			printf "%s: %s:%s: unrecognised escape %s\n", \
     66  1.4    apb 				ARGV[0], (FILENAME ? FILENAME : "stdin"), FNR, \
     67  1.4    apb 				substr(s, 1, 2) \
     68  1.4    apb 				>"/dev/stderr"
     69  1.4    apb 			unvis_result = unvis_result "" substr(s, 1, 1)
     70  1.4    apb 			s = substr(s, 2)
     71  1.4    apb 		}
     72  1.4    apb 	}
     73  1.4    apb 	return unvis_result
     74  1.4    apb }
     75  1.4    apb 
     76  1.5    apb function vis(s) \
     77  1.5    apb {
     78  1.5    apb 	# We need to encode backslash, space, and tab, because they
     79  1.5    apb 	# would interfere with scripts that attempt to manipulate
     80  1.5    apb 	# the set files.
     81  1.5    apb 	#
     82  1.5    apb 	# We make no attempt to encode shell special characters
     83  1.5    apb 	# such as " ' $ ( ) { } [ ] < > * ?, because nothing that
     84  1.5    apb 	# parses set files would need that.
     85  1.5    apb 	#
     86  1.5    apb 	# We would like to handle other white space or non-graph
     87  1.5    apb 	# characters, because they may be confusing for human readers,
     88  1.5    apb 	# but they are too difficult to handle in awk without the ord()
     89  1.5    apb 	# function, so we print an error message.
     90  1.5    apb 	#
     91  1.5    apb 	# As of October 2014, no files in the set lists contain
     92  1.5    apb 	# characters that would need any kind of encoding.
     93  1.5    apb 	#
     94  1.5    apb 	vis_result = ""
     95  1.5    apb 	while (length(s) > 0) {
     96  1.5    apb 		vis_pos = match(s, "(\\\\|[[:space:]]|[^[:graph:]])")
     97  1.5    apb 		if (vis_pos == 0) {
     98  1.5    apb 			vis_result = vis_result "" s
     99  1.5    apb 			s = ""
    100  1.5    apb 			break
    101  1.5    apb 		}
    102  1.5    apb 		# copy the part before the next special char
    103  1.5    apb 		vis_result = vis_result "" substr(s, 1, vis_pos - 1)
    104  1.5    apb 		vis_char = substr(s, vis_pos, 1)
    105  1.5    apb 		s = substr(s, vis_pos + 1)
    106  1.5    apb 		# process the special char
    107  1.5    apb 		if (vis_char == "\\") {
    108  1.5    apb 			# backslash -> double backslash
    109  1.5    apb 			vis_result = vis_result "\\\\"
    110  1.5    apb 		} else if (vis_char == " ") {
    111  1.5    apb 			# space -> \040
    112  1.5    apb 			vis_result = vis_result "\\040"
    113  1.5    apb 		} else if (vis_char == "\t") {
    114  1.5    apb 			# tab -> \011
    115  1.5    apb 			vis_result = vis_result "\\011"
    116  1.5    apb 		} else {
    117  1.5    apb 			# generalised \ooo with three octal digits.
    118  1.5    apb 			# XXX: I don't know how to do this in awk without ord()
    119  1.5    apb 			printf "%s: %s:%s: cannot perform vis encoding\n", \
    120  1.5    apb 				ARGV[0], (FILENAME ? FILENAME : "stdin"), FNR \
    121  1.5    apb 				>"/dev/stderr"
    122  1.5    apb 			vis_result = vis_result "" vis_char
    123  1.5    apb 		}
    124  1.5    apb 	}
    125  1.5    apb 	return vis_result
    126  1.5    apb }
    127  1.5    apb 
    128  1.5    apb // { $1 = vis(unvis($1)); print }
    129  1.5    apb 
    130  1.1  lukem BEGIN \
    131  1.1  lukem {
    132  1.1  lukem 	if (ARGC != 3) {
    133  1.2  lukem 		printf("Usage: join file1 file2\n") >"/dev/stderr"
    134  1.2  lukem 		exit 1
    135  1.1  lukem 	}
    136  1.4    apb 	while ( (getline < ARGV[1]) > 0) {
    137  1.5    apb 		$1 = vis(unvis($1))
    138  1.2  lukem 		words[$1] = $0
    139  1.4    apb 	}
    140  1.2  lukem 	delete ARGV[1]
    141  1.1  lukem }
    142  1.1  lukem 
    143  1.5    apb // { $1 = vis(unvis($1)) }
    144  1.4    apb 
    145  1.2  lukem $1 in words \
    146  1.2  lukem {
    147  1.2  lukem 	f1=$1
    148  1.2  lukem 	$1=""
    149  1.2  lukem 	print words[f1] $0
    150  1.2  lukem }
    151