nanpa.sed revision 1.3
11.3Sjmcneill# $NetBSD: nanpa.sed,v 1.3 2023/01/28 13:12:16 jmcneill Exp $
21.1Sjhawk#
31.1Sjhawk# Parse HTML tables output by 
41.1Sjhawk#   http://docs.nanpa.com/cgi-bin/npa_reports/nanpa
51.1Sjhawk# Specifically, for each html table row (TR),
61.2Swiz# print the <TD> elements separated by colons.
71.1Sjhawk#
81.1Sjhawk# This could break on HTML comments.
91.1Sjhawk#
101.1Sjhawk:top
111.1Sjhawk#				Strip ^Ms (carriage returns). NOTE(review): the pattern below appears to be a literal CR rendered as a line break -- confirm against the original file
121.1Sjhawks/
131.1Sjhawk//g
141.1Sjhawk#				If the line ends inside an unclosed '<...' tag, append the next input line and restart from :top
151.1Sjhawk/<[^>]*$/{
161.1Sjhawk	N
171.1Sjhawk	b top
181.1Sjhawk}
191.1Sjhawk#				Replace every </TR> with a literal '$', used below as an end-of-row marker
201.1Sjhawks;</[Tt][Rr]>;$;g
211.1Sjhawk# 				If the line ends with a <TR ...> tag, append the next line, drop the embedded newlines, and restart
221.1Sjhawk/<[Tt][Rr][^>]*>$/{
231.1Sjhawk	N
241.1Sjhawk	s/\n//g
251.1Sjhawk	b top
261.1Sjhawk}
271.1Sjhawk#				Also keep joining while a tag is followed by no '$' marker up to end of line. NOTE(review): [TtRr] is a single character class, so any tag starting with T or R (e.g. <TD>, <TABLE>) matches, not only <TR> -- confirm intent
281.1Sjhawk/<[TtRr][^>]*>[^$]*$/{
291.1Sjhawk	N
301.1Sjhawk	s/\n//g
311.1Sjhawk	b top
321.1Sjhawk}
331.1Sjhawk#				Remove the '$' end-of-row marker at the very end of the line
341.1Sjhawks/\$$//
351.1Sjhawk#				Delete lines that contain no <TR> tag (the pattern is not anchored to line start)
361.1Sjhawk/<[Tt][Rr][^>]*>/!d
371.3Sjmcneill#				Replace each <TD> tag, with any spaces/tabs before it and spaces after it, by a colon
381.1Sjhawks/[ 	]*<[Tt][Dd][^>]*> */:/g
391.1Sjhawk#				Strip all remaining HTML tags
401.1Sjhawks/<[^>]*>//g
411.1Sjhawk#				Replace &nbsp; entities with a plain space
421.1Sjhawks/&nbsp;/ /g
431.1Sjhawk#				Collapse each run of spaces/tabs into a single space
441.1Sjhawks/[ 	][ 	]*/ /g
451.3Sjmcneill#				Strip the first colon on the line (unanchored; normally the one generated for the first <TD>)
461.1Sjhawks/://
471.3Sjmcneill#				Strip leading spaces, then a single trailing space (runs were already collapsed above)
481.1Sjhawks/ *//
491.3Sjmcneills/ $//
501.3Sjmcneill#				Blank out lines beginning with '--' (presumably HTML comment remnants); the emptied line is still printed
51s/^--.*$//
52