#!/usr/bin/gawk -f
# R.Muralikrishnan, MPI for Empirical Aesthetics.
# This work is licensed under a Creative Commons
# Attribution-NonCommercial-ShareAlike 4.0 International Licence.
#########################################################################
# AWK script to parse the HTML output of curl from freiburger-anthologie.de
#
# Substitute all the HTML tags with nothing; what remains is the text of
# interest.
#
# Since the HTML pages on freiburger-anthologie.de are in ISO8859-1 encoding,
# the input to this script must first be converted to UTF-8 for the umlaut
# conversions below to work. Otherwise bad is made worse!
#
# Execute as:
#   awk -f Parse-Html-Output--Freiburg-Anthology.awk Input-UTF-8.txt > Output.txt
#
# Author: R.Muralikrishnan
#########################################################################
#
# Typical workflow:
#   curl 'http://freiburger-anthologie.ub.uni-freiburg.de/fa/fa.pl?cmd=gedichte&sub=show&add=&print=1&spalten=&id=1' -o Temp.txt
# Temp.txt is in ISO8859-1 encoding from the source, so first re-encode it
# to UTF-8 using:
#   iconv -f iso8859-1 -t utf-8
# before feeding iconv's output to this script.
# Ref: http://stackoverflow.com/questions/10172327/linux-curl-save-as-utf-8#10172716

# strip_markup(line): return `line` with HTML markup and page boilerplate
# removed and the German umlaut entities decoded. Returns "" when nothing
# but markup/boilerplate was on the line.
function strip_markup(line)
{
    # Remove leading blanks and tabs. This must happen first; otherwise
    # the "^<something" rule below cannot anchor on the '<'.
    sub(/^[ \t]+/, "", line)

    # Remove complete tags of the sort <anything>, where "anything" is
    # limited to letters, digits and the punctuation = / " . : ; ! space -
    # ('-' is placed last in the bracket expression so it is literal).
    gsub(/<[A-Za-z0-9=\/".:;! -]*>/, "", line)

    # Remove the head of a tag that continues on the next input line:
    # a line starting with '<' followed by at least one more character.
    sub(/^<.+/, "", line)

    # Remove the tail of such a tag: everything up to and including the
    # last "> on the line (at least one character must precede it).
    sub(/.+">/, "", line)

    # The line  cellspacing="0" cellpadding="3"  contains no '>' and so
    # is not caught by the rules above; drop it explicitly.
    sub(/^cellspacing.*/, "", line)

    # Remove the copyright line at the end of the page, whether the sign
    # arrives as the entity &copy; or as the literal character.
    sub(/^(&copy;|©).*/, "", line)

    # Remove the version marker "(v. 1.99)" on the very last line.
    # The dots are escaped so that only this literal text is removed.
    gsub(/\(v\. 1\.99\)/, "", line)

    # Decode the named HTML entities for the German umlauts and sharp s.
    # NOTE(review): assumes the pages encode these characters as named
    # entities (&Auml; etc.) -- confirm against a sample page.
    gsub(/&Auml;/,  "Ä", line)
    gsub(/&Ouml;/,  "Ö", line)
    gsub(/&Uuml;/,  "Ü", line)
    gsub(/&auml;/,  "ä", line)
    gsub(/&ouml;/,  "ö", line)
    gsub(/&uuml;/,  "ü", line)
    gsub(/&szlig;/, "ß", line)

    return line
}

# Main rule: clean every input line and print only the non-empty results.
{
    cleaned = strip_markup($0)
    if (cleaned != "")
        print cleaned
}