# Creative Commons License: R. Muralikrishnan, MPI for Empirical Aesthetics. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
#!/usr/bin/gawk -f
#########################################################################O
# AWK Script to parse html output of curl from freiburger-anthologie.de
#
# Substitute all the html tags with nothing; and what remains is
# ... the text of interest.
# Since the html pages on freiburger-anthologie.de are in ISO8859-1 encoding,
# the input to this awk script must first be converted to UTF-8 for the
# Umlaut conversions below to work.  Otherwise bad is made worse!
#
# Execute as: awk -f Parse-Html-Output--Freiburg-Anthology.awk Input-UTF-8.txt > Output.txt
#
# Author: R.Muralikrishnan
#########################################################################m



# curl 'http://freiburger-anthologie.ub.uni-freiburg.de/fa/fa.pl?cmd=gedichte&sub=show&add=&print=1&spalten=&id=1' -o Temp.txt
# Temp.txt is in ISO8859-1 encoding from the source.  So first re-encode it to UTF-8 using: 
# iconv -f iso8859-1 -t utf-8
# before feeding iconv's output to this script. 
# Otherwise the gsub commands for umlauts won't work.
# Ref: http://stackoverflow.com/questions/10172327/linux-curl-save-as-utf-8#10172716




# clean_line(line)
#   Return `line` with the HTML markup removed and the German character
#   entities replaced by the literal characters written in this file.
#   The result is the empty string when nothing but markup was on the line.
#
#   The order of the substitutions matters: later patterns rely on what the
#   earlier ones have already removed.
function clean_line(line) {
    # Strip leading indentation (spaces and tabs).  This must happen first,
    # because several patterns below are anchored at the start of the line.
    sub(/^[ \t]+/, "", line);

    # Remove complete tags of the sort <anything>, where "anything" consists
    # only of the characters listed in the bracket expression.
    gsub(/<[A-Za-z0-9=\/\"\.\-:;! ]*>/, "", line);

    # Remove a line that (still) starts with "<something": the whole rest of
    # the line is dropped.  "..*" means "one or more characters".
    sub(/^<..*/, "", line);

    # Remove everything up to and including the last something"> on the line.
    # NOTE(review): the match is greedy, so any text in front of the last
    # "> on the same line is removed as well.
    gsub(/..*\">/, "", line);

    # Remove the line: cellspacing="0" cellpadding="3"
    # It contains no ">" and is therefore not caught by the patterns above.
    sub(/^cellspacing.*/, "", line);

    # Remove the copyright line at the end, which starts with: &copy;
    sub(/^&copy;.*/, "", line);

    # Remove the version number in the very last line: (v. 1.99)
    # The dots are escaped so that only this literal text is matched.
    gsub(/\(v\. 1\.99\)/, "", line);

    # Replace the HTML entities for the umlauts and the sharp s with the
    # characters themselves.
    gsub(/&Auml;/,  "Ä", line);
    gsub(/&Ouml;/,  "Ö", line);
    gsub(/&Uuml;/,  "Ü", line);
    gsub(/&auml;/,  "ä", line);
    gsub(/&ouml;/,  "ö", line);
    gsub(/&uuml;/,  "ü", line);
    gsub(/&szlig;/, "ß", line);

    return line;
}

# Main rule: clean every input record in place and print it unless nothing
# is left of it.
{
    $0 = clean_line($0);
    if ($0 != "") {print $0};
}