#	$NetBSD: join.awk,v 1.5 2014/10/23 14:19:33 apb Exp $
#
# Copyright (c) 2002 The NetBSD Foundation, Inc.
# All rights reserved.
#
# This code is derived from software contributed to The NetBSD Foundation
# by Luke Mewburn of Wasabi Systems.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# join.awk F1 F2
#	Similar to join(1), this reads a list of words from F1
#	and outputs lines in F2 with a first word that is in F1.
#	The first word is canonicalised via vis(unvis(word)).
#	Neither file needs to be sorted.

# unvis(s)
#	Decode the subset of vis(3)-style escapes that this script
#	understands and return the decoded string:
#	    \\    -> a single backslash
#	    \ooo  -> the character produced by sprintf("%c", ooo), where
#	             ooo is three octal digits
#	Any other backslash sequence is reported on stderr and its
#	backslash is kept literally; decoding then resumes with the
#	character that followed it.  A lone backslash at the very end of
#	s is not an escape and is copied through unchanged.
#
#	The parameters after s are local scratch variables (awk's only
#	way to declare locals); callers pass s alone.
#
function unvis(s,    result, pos) \
{
	# XXX: We don't handle the complete range of vis encodings
	result = ""
	while (length(s) > 0) {
		# find the next backslash that is followed by another char
		pos = match(s, "\\\\.")
		if (pos == 0) {
			# no escapes left: the remainder is literal text
			result = result s
			break
		}
		# copy the part before the next backslash
		result = result substr(s, 1, pos - 1)
		s = substr(s, pos)
		# process the backslash and next few chars
		if (substr(s, 1, 2) == "\\\\") {
			# double backslash -> single backslash
			result = result "\\"
			s = substr(s, 3)
		} else if (match(s, "\\\\[0-7][0-7][0-7]") == 1) {
			# \ooo with three octal digits.
			# XXX: use strtonum() when that is available
			result = result sprintf("%c", \
				0+substr(s, 2, 1) * 64 + \
				0+substr(s, 3, 1) * 8 + \
				0+substr(s, 4, 1))
			s = substr(s, 5)
		} else {
			# unrecognised escape: keep the literal backslash
			# and let the next iteration handle what follows it
			printf "%s: %s:%s: unrecognised escape %s\n", \
				ARGV[0], (FILENAME ? FILENAME : "stdin"), FNR, \
				substr(s, 1, 2) \
				>"/dev/stderr"
			result = result substr(s, 1, 1)
			s = substr(s, 2)
		}
	}
	return result
}

# vis(s)
#	Return s with every character that could confuse tools parsing
#	the set files replaced by a vis(3)-style escape:
#	    backslash                   -> \\
#	    white space and any other
#	    non-graph character         -> \ooo (three octal digits),
#	                                   e.g. space -> \040, tab -> \011
#
#	We need to encode backslash, space, and tab, because they
#	would interfere with scripts that attempt to manipulate
#	the set files.
#
#	We make no attempt to encode shell special characters
#	such as " ' $ ( ) { } [ ] < > * ?, because nothing that
#	parses set files would need that.
#
#	awk has no ord() function, so the character -> code mapping is
#	built once, on first use, by running sprintf("%c", i) over the
#	codes 1..255 and remembering the result in the global array
#	vis_ord.  A character that is not in that table cannot be
#	encoded; it is reported on stderr and copied through unchanged.
#
#	The parameters after s are local scratch variables (awk's only
#	way to declare locals); callers pass s alone.
#
function vis(s,    result, pos, ch, code) \
{
	# Build the reverse lookup table the first time we are called.
	if (!(" " in vis_ord)) {
		for (code = 1; code < 256; code++)
			vis_ord[sprintf("%c", code)] = code
	}

	result = ""
	while (length(s) > 0) {
		pos = match(s, "(\\\\|[[:space:]]|[^[:graph:]])")
		if (pos == 0) {
			# nothing left that needs encoding
			result = result s
			break
		}
		# copy the part before the next special char
		result = result substr(s, 1, pos - 1)
		ch = substr(s, pos, 1)
		s = substr(s, pos + 1)
		# process the special char
		if (ch == "\\") {
			# backslash -> double backslash
			result = result "\\\\"
		} else if (ch in vis_ord) {
			# generalised \ooo with three octal digits
			result = result sprintf("\\%03o", vis_ord[ch])
		} else {
			# no known code for this character: complain and
			# pass it through as-is
			printf "%s: %s:%s: cannot perform vis encoding\n", \
				ARGV[0], (FILENAME ? FILENAME : "stdin"), FNR \
				>"/dev/stderr"
			result = result ch
		}
	}
	return result
}

# Main processing: F1 is loaded by the BEGIN rule; each record of F2 is
# then canonicalised and emitted only when its first word was seen in F1.
# No rule may print unconditionally, otherwise every F2 line would be
# echoed in addition to the joined output.

# Start-up: validate the command line and load F1 (ARGV[1]) into the
# words[] array, keyed by the canonicalised first word of each line and
# holding the whole (canonicalised) line as the value.  ARGV[1] is then
# removed from the argument list so that only F2 is read as main input.
BEGIN \
{
	if (ARGC != 3) {
		printf("Usage: join file1 file2\n") >"/dev/stderr"
		exit 1
	}
	# getline returns 1 per record, 0 at end of file, -1 on error
	while ((join_status = (getline < ARGV[1])) > 0) {
		# assigning $1 also rebuilds $0, which is what gets stored
		$1 = vis(unvis($1))
		words[$1] = $0
	}
	if (join_status < 0) {
		# an unreadable F1 must not be mistaken for an empty one
		printf "%s: cannot read %s\n", ARGV[0], ARGV[1] \
			>"/dev/stderr"
		exit 1
	}
	close(ARGV[1])
	delete ARGV[1]
}

# For every input record, replace the first word by its canonical
# vis(unvis()) form, so that it can be compared against the keys of
# words[].  Assigning to $1 also makes awk rebuild $0 from the fields.
{
	$1 = vis(unvis($1))
}

# Join: when the first word of the current record is a key of words[],
# print the line stored for that key followed by the current record
# with its first field blanked out.
($1 in words) {
	joined = words[$1]
	$1 = ""
	print joined $0
}