-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlist_urls.sed
executable file
·39 lines (29 loc) · 1.07 KB
/
list_urls.sed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#! /bin/sed -nf
#
# list_urls.sed -- print the target of every href found in <a>, <img> and
# <area> tags of an HTML stream, one per output line.  When the tag also
# carries an alt attribute the line is printed as "URL|ALT".
#
# Notes:
#   * Runs with -n: nothing is printed except by the explicit `p` below.
#   * Uses GNU sed BRE extensions (\| and \+).
#   * Only href= is extracted; other attributes (e.g. src=) are dropped.
#   * URLs that share one (joined) line are printed right-to-left, because
#     the extraction loop peels them off the end of the pattern space.

# Join lines if we have tags that span multiple lines.
# The pattern space ends inside an unterminated "<...": append the next
# line and collapse the line break (plus surrounding blanks) to one space.
# `$!` guards the N: on the last input line N would make sed quit and,
# under -n, silently discard URLs already complete in the pattern space.
:join
/<[^>]*$/ {
  $!{
    N
    s/[[:blank:]]*\n[[:blank:]]*/ /
    b join
  }
}

# Do some selection to speed the thing up: lines without an <a.../<img...
# tag start (<area also begins with "<a") cannot yield a URL, skip them.
/<[[:blank:]]*\([aA]\|[iI][mM][gG]\)/!b

# Remove extra blanks before/after the tag name, change img/area to a.
# Every alternative needs the escaped \| -- a bare | is a literal in BRE.
s/<[[:blank:]]*\([aA]\|[iI][mM][gG]\|[aA][rR][eE][aA]\)[[:blank:]]\+/<a /g

# To simplify the regexps that follow, change href/alt to lowercase
# and replace whitespace before them with a single space
s/<a\([^>]*\)[[:blank:]]\+[hH][rR][eE][fF]=/<a\1 href=/g
s/<a\([^>]*\)[[:blank:]]\+[aA][lL][tT]=/<a\1 alt=/g

# To simplify the regexps that follow, quote the arguments to href and alt
# (an unquoted value ends at the first blank, quote or '>')
s/href=\([^"[:blank:]>]\+\)/href="\1"/g
s/alt=\([^"[:blank:]>]\+\)/alt="\1"/g

# Move the alt tag after href, remove attributes between them
s/\( alt="[^"]*"\)[^>]*\( href="[^"]*"\)/\2\1/g

# Remove attributes between <a and href
s/<a[^>]* href="/<a href="/g

# Change href="xxx" ... alt="yyy" to href="xxx|yyy"
s/\(<a href="[^"]*\)"[^>]* alt="\([^"]*"\)/\1|\2/g

# Jump to the label right below.  The jump itself is a no-op; its purpose
# is to reset sed's "a substitution succeeded" flag so that the `t loop`
# at the bottom only reacts to substitutions made inside the loop.
t loop

# Print an URL, remove it, and loop
:loop
# Save the line; the next s/// destroys it when it matches.
h
# Greedy .* selects the LAST <a href="..."> in the pattern space; print
# just its (possibly "URL|ALT") value.
s/.*<a href="\([^"]*\)".*$/\1/p
# Restore the saved line and cut it off just before that last tag.
g
s/\(.*\)<a href="\([^"]*\)".*$/\1/
# Something was removed => there may be more tags to the left; otherwise
# fall off the end of the script and start the next input line.
t loop